all realize 2 (#4527)

* all realize 2
* tests fixup
* fix more tests
* fix openpilot
* fix tests
* unneeded
2026-06-11 07:27:43 +08:00 · 2024-05-10 22:43:09 -07:00
parent d2c347fc74
commit 2f970a4fc2
21 changed files with 142 additions and 139 deletions
--- a/docs-legacy/abstractions2.py
+++ b/docs-legacy/abstractions2.py
@@ -55,11 +55,12 @@ alu = LazyOp(BinaryOps.ADD, (ld_1, ld_2))
 st_0 = LazyOp(BufferOps.STORE, (alu,), MemBuffer(0, dtypes.int32, ShapeTracker.from_shape((1,))))

 # convert the computation to a "linearized" format (print the format)
-lin = Device[DEVICE].get_linearizer(st_0).linearize()
+from tinygrad.engine.realize import get_linearizer, CompiledRunner
+lin = get_linearizer(Device[DEVICE].renderer, (st_0,)).linearize()
 for u in lin.uops: print(u)

 # compile a program (and print the source)
-fxn = Device[DEVICE].to_runner(lin)
+fxn = CompiledRunner(lin.to_program())
 print(fxn.p.src)
 # NOTE: fxn.clprg is the ClangProgram

--- a/extra/export_model.py
+++ b/extra/export_model.py
@@ -1,6 +1,6 @@
 from typing import Tuple, Dict, List
 from tinygrad.dtype import DType
-from tinygrad.device import Program
+from tinygrad.renderer import Program
 from tinygrad.tensor import Device, Tensor
 from tinygrad.engine.jit import TinyJit
 from tinygrad.nn.state import get_state_dict
--- a/openpilot/compile2.py
+++ b/openpilot/compile2.py
@@ -15,9 +15,9 @@ from typing import Tuple, List, Optional, Dict, cast
 from extra.onnx import get_run_onnx
 from tinygrad import Tensor, Device, GlobalCounters, dtypes
 from tinygrad.dtype import ImageDType
-from tinygrad.device import CompiledRunner, Buffer
+from tinygrad.device import Buffer
 from tinygrad.helpers import partition, Context, fetch, getenv, DEBUG
-from tinygrad.engine.realize import run_schedule, lower_schedule, ExecItem
+from tinygrad.engine.realize import run_schedule, lower_schedule, ExecItem, CompiledRunner
 from tinygrad.engine.memory import memory_planner
 from tinygrad.engine.schedule import create_schedule
 from tinygrad.ops import LoadOps, ScheduleItem
--- a/test/external/external_test_speed_llama.py
+++ b/test/external/external_test_speed_llama.py
@@ -4,7 +4,8 @@ from examples.llama import Transformer, MODEL_PARAMS
 from tinygrad.tensor import Tensor
 from tinygrad import Device
 from tinygrad.nn.state import get_state_dict
-from tinygrad.device import Allocator, method_cache
+from tinygrad.device import Allocator
+from tinygrad.engine.realize import method_cache
 from tinygrad.helpers import Profiling

 class FakeProgram:
--- a/test/external/fuzz_linearizer.py
+++ b/test/external/fuzz_linearizer.py
@@ -9,6 +9,7 @@ from tinygrad.codegen.linearizer import Linearizer, UOp
 from tinygrad.codegen.kernel import Opt, OptOps
 from tinygrad.features.search import get_linearizer_actions, bufs_from_lin
 from tinygrad.features.graph import print_tree
+from tinygrad.engine.realize import CompiledRunner
 from tinygrad.helpers import getenv, from_mv, prod, colored, Context, DEBUG
 from tinygrad.ops import LazyOp, UnaryOps, BufferOps

@@ -55,7 +56,7 @@ def run_linearizer(lin: Linearizer, rawbufs=None, var_vals=None):

  # TODO: images needs required_optimization
  try:
-    prg = device.to_runner(lin)
+    prg = CompiledRunner(lin.to_program())
  except Exception:
    traceback.print_exc()
    return "COMPILE_ERROR"
--- a/test/helpers.py
+++ b/test/helpers.py
@@ -1,7 +1,7 @@
 import sys
 import numpy as np
 from tinygrad import Tensor, Device, dtypes
-from tinygrad.device import Runner
+from tinygrad.engine.realize import Runner
 from tinygrad.dtype import DType
 from tinygrad.nn.state import get_parameters
 from tinygrad.helpers import Context, CI, OSX, getenv
--- a/test/test_custom_function.py
+++ b/test/test_custom_function.py
@@ -10,8 +10,10 @@ from tinygrad.dtype import dtypes
 # *** first, we implement the atan2 op at the lowest level ***
 # `atan2_gpu` for GPUBuffers and `atan2_cpu` for CPUBuffers
 from tinygrad.lazy import Buffer, create_lazybuffer
-from tinygrad.device import CompiledRunner, Device, Program
+from tinygrad.device import Device
 from tinygrad.shape.shapetracker import ShapeTracker
+from tinygrad.engine.realize import CompiledRunner
+from tinygrad.renderer import Program

 # we don't always have GPU support, so the type signature is the abstract CompiledBuffer instead of GPUBuffer
 def atan2_gpu(ret:Buffer, a:Buffer, b:Buffer):
--- a/test/test_linearizer.py
+++ b/test/test_linearizer.py
@@ -1,5 +1,6 @@
 import numpy as np
 import unittest
+from dataclasses import replace

 from tinygrad.codegen.kernel import Opt, OptOps, KernelOptError, tensor_cores
 from tinygrad.codegen.linearizer import Linearizer, UOp, UOps, expand_node, expand_idxs
@@ -10,7 +11,7 @@ from tinygrad.shape.view import View
 from tinygrad.shape.symbolic import MulNode, Variable, NumNode, Node
 from tinygrad.tensor import Tensor
 from tinygrad.engine.schedule import create_schedule
-from tinygrad.engine.realize import run_schedule, lower_schedule
+from tinygrad.engine.realize import run_schedule, lower_schedule, CompiledRunner
 from tinygrad.helpers import prod, Context, getenv, CI
 from tinygrad.dtype import DType, dtypes
 from tinygrad.codegen.uops import UOpGraph
@@ -269,7 +270,7 @@ class TestLinearizer(unittest.TestCase):
        assert len([uop for uop in k.uops if uop.uop is UOps.WMMA]) > 0, "tensor core not triggered"
        assert len([x for x in k.applied_opts if x.op is OptOps.TC]) == 1, "tensor core opt not included"

-        prg = Device[Device.DEFAULT].to_runner(k)
+        prg = CompiledRunner(k.to_program())
        real_bufs[0].copyin(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np).data) # Zero to check that all values are filled
        prg.exec(real_bufs)
        result = np.frombuffer(real_bufs[0].as_buffer(), real_bufs[0].dtype.np)
@@ -586,7 +587,9 @@ def helper_linearizer_opt(r:Tensor, opts=[], apply_tc=False, atol=1e-4, rtol=1e-
  wanna_output = None
  realized_ast, real_bufs = helper_realized_ast(r)

-  def check_opt(opts, create_k, to_prg, expected_color_size):
+  def get_prg(k:Linearizer): return CompiledRunner(replace(k.to_program(), dname=Device.DEFAULT))
+
+  def check_opt(opts, create_k, expected_color_size):
    k = create_k()
    if apply_tc:
      assert k.apply_tensor_cores(1, extra_opts=opts), "no tensor core triggered"
@@ -595,26 +598,26 @@ def helper_linearizer_opt(r:Tensor, opts=[], apply_tc=False, atol=1e-4, rtol=1e-
        k.apply_opt(opt)
    if expected_color_size is not None:
      assert (cs:=[(x,y) for x,y in zip(k.colors(), k.full_shape)]) == expected_color_size, f"expected={expected_color_size} got={cs}"
-    prg = to_prg(k)
+    prg = get_prg(k)
    real_bufs[0].copyin(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np).data) # Zero to check that all values are filled
    prg.exec(real_bufs)
    np.testing.assert_allclose(np.frombuffer(real_bufs[0].as_buffer(), real_bufs[0].dtype.np), wanna_output, atol=atol, rtol=rtol)

  # Get baseline, which is not optimized at all.
  k = Linearizer(realized_ast)
-  prg = Device[Device.DEFAULT].to_runner(k)
+  prg = get_prg(k)
  prg.exec(real_bufs)
  wanna_output = np.frombuffer(real_bufs[0].as_buffer(), real_bufs[0].dtype.np).copy()

  # Check correctness of handcoded optimiztions.
  k = Linearizer(realized_ast)
  k.hand_coded_optimizations()
-  prg = Device[Device.DEFAULT].to_runner(k)
+  prg = get_prg(k)
  real_bufs[0].copyin(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np).data) # Zero to check that all values are filled
  prg.exec(real_bufs)
  np.testing.assert_allclose(wanna_output, np.frombuffer(real_bufs[0].as_buffer(), real_bufs[0].dtype.np), atol=atol, rtol=rtol)
  for i, x in enumerate(opts): # Check custom transformations if any.
-    check_opt(x, lambda: Linearizer(realized_ast), Device[Device.DEFAULT].to_runner, color_sizes[i] if i < len(color_sizes) else None)
+    check_opt(x, lambda: Linearizer(realized_ast), color_sizes[i] if i < len(color_sizes) else None)

 class TestKernelOpts(unittest.TestCase):
  def test_local_and_grouped_reduce(self):
--- a/test/test_multitensor.py
+++ b/test/test_multitensor.py
@@ -1,12 +1,11 @@
 import unittest, functools, random
 from typing import List
 from tinygrad import Tensor, Device, nn, GlobalCounters, TinyJit, dtypes
-from tinygrad.device import CompiledRunner
 from tinygrad.ops import LoadOps, ReduceOps
 from tinygrad.helpers import CI, prod, Context
 from tinygrad.nn.state import get_parameters, get_state_dict
 from tinygrad.engine.schedule import create_schedule
-from tinygrad.engine.realize import lower_schedule, BufferCopy
+from tinygrad.engine.realize import lower_schedule, BufferCopy, CompiledRunner
 from tinygrad.features.multi import all_reduce, MultiLazyBuffer
 from random import randint
 import numpy as np
--- a/test/test_uops.py
+++ b/test/test_uops.py
@@ -4,9 +4,11 @@ import numpy as np
 from tinygrad.tensor import Tensor
 from tinygrad.helpers import getenv
 from tinygrad.dtype import dtypes, DType, PtrDType
-from tinygrad.device import Buffer, Device, CompiledRunner, Program
+from tinygrad.device import Buffer, Device
 from tinygrad.ops import UnaryOps, BinaryOps, TernaryOps
+from tinygrad.renderer import Program
 from tinygrad.engine.schedule import create_schedule
+from tinygrad.engine.realize import CompiledRunner, lower_schedule_item
 from tinygrad.codegen.linearizer import UOps, UOp
 from tinygrad.codegen.uops import exec_alu, UOpGraph
 from test.helpers import is_dtype_supported
@@ -210,9 +212,8 @@ class TestConstantFolding(unittest.TestCase):
    t = Tensor(1, dtype=dtypes.float).bitcast(dtypes.int)
    si = create_schedule([t.lazydata])
    assert len(si) == 1
-    si = si[0]
-    lin = Device[Device.DEFAULT].get_linearizer(si.ast[0]).linearize()
-    assert any(uop.uop is UOps.BITCAST for uop in lin.uops.uops), f"{[uop.uop for uop in lin.uops.uops]} does not contain bitcast"
+    ji = lower_schedule_item(si[-1])
+    assert any(uop.uop is UOps.BITCAST for uop in ji.prg.p.uops), f"{[uop.uop for uop in ji.prg.p.uops]} does not contain bitcast"

 class TestLocalAccess(unittest.TestCase):
  @unittest.skipIf(Device.DEFAULT in {"LLVM"}, "device doesn't support local memory")
--- a/test/test_uops_stats.py
+++ b/test/test_uops_stats.py
@@ -14,7 +14,7 @@ from tinygrad.engine.realize import lower_schedule_item
 def get_stats(x:Tensor):
  si = create_schedule([x.lazydata])[-1]
  ei = lower_schedule_item(si)
-  return ei.prg.p.op_estimate, ei.prg.p.mem_estimate
+  return ei.prg.op_estimate, ei.prg.mem_estimate

 class TestUOpsStats(unittest.TestCase):
  def test_simple_add(self):
--- a/tinygrad/codegen/kernel.py
+++ b/tinygrad/codegen/kernel.py
@@ -59,9 +59,6 @@ tensor_cores: Dict[str, List[TensorCore]] = {
  "HSA": [TensorCore(dims=(16,16,16), threads=[(0,8),(0,2),(1,2)], thread_local_sizes=[[16],[16],[4,2]], thread_local_aliases=[ [[0],[0],[2],[-1],[1]], [[1],[2],[0],[-1],[0]], [[1],[2],[-2],[0],[3,-1]] ], dtype_in=di, dtype_out=do) for (di, do) in [(dtypes.half, dtypes.float), (dtypes.half, dtypes.half)]],  # noqa: E501
  "CUDA": [TensorCore(dims=(8,16,16), threads=[(0,2),(0,2),(1,2),(1,2),(0,2)], thread_local_sizes=[[2,2,2],[2,2],[2,2]], thread_local_aliases=[ [[0],[0],[5],[-2],[0],[-1,1,2,-3],[3,4]], [[3],[4],[0],[0],[5],[-1,1,2,-2],[0]], [[-1],[1],[5],[-2],[2],[0],[3,4]] ], dtype_in=di, dtype_out=do) for (di, do) in ([(dtypes.half, dtypes.float)] if getenv("PTX") else [(dtypes.half, dtypes.float), (dtypes.bfloat16, dtypes.float)])],  # noqa: E501
 }
-tensor_cores["AMD"] = tensor_cores["HSA"]
-tensor_cores["RHIP"] = tensor_cores["HSA"]
-tensor_cores["NV"] = tensor_cores["CUDA"]

 class LocalBuffer(NamedTuple):
  name: str
--- a/tinygrad/device.py
+++ b/tinygrad/device.py
@@ -1,17 +1,12 @@
 from __future__ import annotations
 import multiprocessing
-from dataclasses import dataclass, replace
+from dataclasses import dataclass
 from collections import defaultdict
-from typing import TYPE_CHECKING, List, Optional, Dict, Tuple, Any
+from typing import List, Optional, Dict, Tuple, Any
 import importlib, inspect, functools, pathlib, os, ctypes
-from tinygrad.helpers import getenv, all_int, diskcache_get, diskcache_put, DEBUG,BEAM,NOOPT, GlobalCounters, flat_mv, from_mv
-from tinygrad.shape.symbolic import Variable, sint
+from tinygrad.helpers import getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, from_mv
 from tinygrad.dtype import DType, ImageDType
-from tinygrad.ops import LazyOp
-from tinygrad.renderer import Renderer, Program
-
-if TYPE_CHECKING:
-  from tinygrad.codegen.linearizer import Linearizer
+from tinygrad.renderer import Renderer

 # **************** Device ****************

@@ -167,18 +162,6 @@ class _MallocAllocator(LRUAllocator):

 MallocAllocator = _MallocAllocator()

-# **************** base Runner + helpers ****************
-
-class Runner:
-  def __init__(self, display_name:str, dname:str, op_estimate:sint=0, mem_estimate:sint=0):
-    self.first_run, self.display_name, self.dname, self.op_estimate, self.mem_estimate = True, display_name, dname, op_estimate, mem_estimate
-  @property
-  def device(self): return Device[self.dname]
-  def exec(self, rawbufs:List[Buffer], var_vals:Optional[Dict[Variable, int]]=None) -> Optional[float]:
-    return self(rawbufs, {} if var_vals is None else var_vals)
-  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False) -> Optional[float]:
-    raise NotImplementedError("override this")
-
 # **************** for Compiled Devices ****************

 class Compiler:
@@ -190,79 +173,8 @@ class Compiler:
      if self.cachekey is not None: diskcache_put(self.cachekey, src, lib)
    return lib

-class CompiledRunner(Runner):
-  def __init__(self, p:Program, precompiled:Optional[bytes]=None):
-    if DEBUG >= 4: print(p.src)
-    self.p:Program = p
-    self.lib:bytes = precompiled if precompiled is not None else Device[p.dname].compiler.compile_cached(p.src)
-    self.clprg = Device[p.dname].runtime(p.function_name, self.lib)
-    super().__init__(p.name, p.dname, p.op_estimate, p.mem_estimate)
-
-  def __reduce__(self): return self.__class__, (self.p, self.lib)
-
-  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False) -> Optional[float]:
-    global_size, local_size = self.p.launch_dims(var_vals)
-    if global_size is not None and local_size is None and all_int(self.p.global_size): # type: ignore[arg-type]
-      # TODO: this is copied from get_program
-      from tinygrad.features.search import optimize_local_size
-      local_size = optimize_local_size(self.clprg, global_size, rawbufs)
-      global_size = [g//l if g%l == 0 else g/l for g,l in zip(global_size, local_size)]
-      self.p = replace(self.p, global_size=global_size, local_size=local_size)
-    lra = {}
-    if global_size:
-      lra['global_size'] = global_size
-      assert len(global_size) == 3, "global size must have len 3"
-    if local_size:
-      lra['local_size'] = local_size
-      assert len(local_size) == 3, "local size must have len 3"
-    return self.clprg(*[x._buf for x in rawbufs], **lra, vals=tuple(var_vals[k] for k in self.p.vars), wait=wait)
-
-method_cache: Dict[Tuple[str, Tuple[LazyOp, ...], int, bool], CompiledRunner] = {}
-logkerns, logkerns_level = open(getenv("LOGKERNS", ""), "a") if getenv("LOGKERNS", "") else None, getenv("LOGKERNS_LEVEL", 1)
 class Compiled:
  def __init__(self, device:str, allocator:Allocator, renderer:Optional[Renderer], compiler:Optional[Compiler], runtime, graph=None):
    self.dname, self.allocator, self.compiler, self.runtime, self.graph = device, allocator, compiler if compiler else Compiler(), runtime, graph
    self.renderer = renderer if renderer else Renderer()
  def synchronize(self): pass  # override this in your device
-
-  def to_runner(self, k:Linearizer) -> CompiledRunner: return CompiledRunner(replace(k.to_program(), dname=self.dname))
-
-  def get_linearizer(self, *ast:LazyOp) -> Linearizer:
-    if DEBUG >= 3:
-      from tinygrad.features.graph import print_tree
-      for op in ast: print_tree(op)
-    from tinygrad.codegen.linearizer import Linearizer
-    k = Linearizer(*ast, opts=self.renderer)
-    k.required_optimizations()
-    if not NOOPT:
-      if not (used_tensor_cores:=k.apply_tensor_cores(getenv("TC", 1))): k.hand_coded_optimizations()
-      if BEAM >= 1:
-        from tinygrad.features.search import beam_search, time_linearizer, bufs_from_lin
-        kb, k_opt = Linearizer(*ast, opts=self.renderer), k
-        kb.required_optimizations()
-        rawbufs = bufs_from_lin(kb, allocate=False)
-        k = beam_search(kb, rawbufs, BEAM.value, bool(getenv("BEAM_ESTIMATE", 1)))
-        if getenv("BEAM_COMPARE", 1):
-          # TODO: move the HC/TC/BEAM compare to beam_search so it can be optionally cached which choice is better
-          lins: List[Tuple[str, Linearizer]] = [(f"beam{BEAM.value}", k), (("tc" if used_tensor_cores else "hc"), k_opt)]
-          if used_tensor_cores:
-            lins.append(("hc", Linearizer(*ast, opts=self.renderer)))
-            lins[-1][1].hand_coded_optimizations()
-          timed = sorted([(nm, tk, time_linearizer(tk, rawbufs, allow_test_size=False, clear_l2=True)) for nm, tk in lins], key=lambda x: x[2])
-          if DEBUG >= 1: print("  <  ".join(f"{nm:6s} : {lin.colored_shape(30, dense=True)} : {tm*1e6:8.2f} us" for nm, lin, tm in timed))
-          k = timed[0][1]
-          if logkerns is not None and logkerns_level > 1: logkerns.writelines([f"{(lin.ast, lin.applied_opts)}\n" for (_,lin,_) in timed[1:]])
-    # TODO: check the correctness inline once compare_linearizer is in core
-    if logkerns is not None: logkerns.writelines([f"{(k.ast, k.applied_opts)}\n"])
-    if DEBUG >= 4: print((k.ast, k.applied_opts)) # print here to show final applied_opts for all kernels instead of just in beam_search
-    return k
-
-  def get_runner(self, *ast:LazyOp) -> CompiledRunner:
-    ckey = (self.dname, ast, BEAM.value, False)
-    if cret:=method_cache.get(ckey): return cret
-    bkey = (self.dname.split(":")[0], ast, BEAM.value, True)
-    if bret:=method_cache.get(bkey):
-      method_cache[ckey] = ret = CompiledRunner(replace(bret.p, dname=self.dname), bret.lib)
-    else:
-      method_cache[ckey] = method_cache[bkey] = ret = self.to_runner(self.get_linearizer(*ast))
-    return ret
--- a/tinygrad/engine/jit.py
+++ b/tinygrad/engine/jit.py
@@ -4,11 +4,11 @@ import functools, itertools, collections
 from tinygrad.tensor import Tensor
 from tinygrad.lazy import LazyBuffer
 from tinygrad.helpers import flatten, merge_dicts, DEBUG, Context, GRAPH, BEAM, getenv, all_int, GraphException, colored, JIT
-from tinygrad.device import Buffer, CompiledRunner, Compiled, Device, Runner
+from tinygrad.device import Buffer, Compiled, Device
 from tinygrad.dtype import DType
 from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.symbolic import Variable, sint
-from tinygrad.engine.realize import ExecItem, capturing, EmptyOp, ViewOp, BufferXfer
+from tinygrad.engine.realize import ExecItem, capturing, EmptyOp, ViewOp, BufferXfer, CompiledRunner, Runner
 from tinygrad.engine.memory import _internal_memory_planner
 from tinygrad.nn.state import get_parameters
 from weakref import WeakKeyDictionary
--- a/tinygrad/engine/realize.py
+++ b/tinygrad/engine/realize.py
@@ -1,14 +1,84 @@
-from typing import List, Dict, Optional, cast, Generator
+from typing import List, Dict, Optional, cast, Generator, Tuple
 import time
-from dataclasses import dataclass
-from tinygrad.helpers import colored, getenv, DEBUG, GlobalCounters, ansilen
-from tinygrad.ops import ScheduleItem, BufferOps, LoadOps
-from tinygrad.device import Runner, Device
-from tinygrad.device import Buffer
-from tinygrad.shape.symbolic import Variable, sym_infer
+from dataclasses import dataclass, replace
+from tinygrad.helpers import colored, getenv, DEBUG, GlobalCounters, ansilen, BEAM, NOOPT, all_int
+from tinygrad.ops import ScheduleItem, BufferOps, LoadOps, LazyOp
+from tinygrad.device import Device, Buffer
+from tinygrad.shape.symbolic import Variable, sym_infer, sint
+from tinygrad.renderer import Renderer, Program
+from tinygrad.codegen.linearizer import Linearizer
+
+# **************** Program Creation ****************
+
+logkerns, logkerns_level = open(getenv("LOGKERNS", ""), "a") if getenv("LOGKERNS", "") else None, getenv("LOGKERNS_LEVEL", 1)
+def get_linearizer(renderer:Renderer, ast:Tuple[LazyOp, ...]) -> Linearizer:
+  if DEBUG >= 3:
+    from tinygrad.features.graph import print_tree
+    for op in ast: print_tree(op)
+  k = Linearizer(*ast, opts=renderer)
+  k.required_optimizations()
+  if not NOOPT:
+    if not (used_tensor_cores:=k.apply_tensor_cores(getenv("TC", 1))): k.hand_coded_optimizations()
+    if BEAM >= 1:
+      from tinygrad.features.search import beam_search, time_linearizer, bufs_from_lin
+      kb, k_opt = Linearizer(*ast, opts=renderer), k
+      kb.required_optimizations()
+      rawbufs = bufs_from_lin(kb, allocate=False)
+      k = beam_search(kb, rawbufs, BEAM.value, bool(getenv("BEAM_ESTIMATE", 1)))
+      if getenv("BEAM_COMPARE", 1):
+        # TODO: move the HC/TC/BEAM compare to beam_search so it can be optionally cached which choice is better
+        lins: List[Tuple[str, Linearizer]] = [(f"beam{BEAM.value}", k), (("tc" if used_tensor_cores else "hc"), k_opt)]
+        if used_tensor_cores:
+          lins.append(("hc", Linearizer(*ast, opts=renderer)))
+          lins[-1][1].hand_coded_optimizations()
+        timed = sorted([(nm, tk, time_linearizer(tk, rawbufs, allow_test_size=False, clear_l2=True)) for nm, tk in lins], key=lambda x: x[2])
+        if DEBUG >= 1: print("  <  ".join(f"{nm:6s} : {lin.colored_shape(30, dense=True)} : {tm*1e6:8.2f} us" for nm, lin, tm in timed))
+        k = timed[0][1]
+        if logkerns is not None and logkerns_level > 1: logkerns.writelines([f"{(lin.ast, lin.applied_opts)}\n" for (_,lin,_) in timed[1:]])
+  # TODO: check the correctness inline once compare_linearizer is in core
+  if logkerns is not None: logkerns.writelines([f"{(k.ast, k.applied_opts)}\n"])
+  if DEBUG >= 4: print((k.ast, k.applied_opts)) # print here to show final applied_opts for all kernels instead of just in beam_search
+  return k

 # **************** Runners ****************

+class Runner:
+  def __init__(self, display_name:str, dname:str, op_estimate:sint=0, mem_estimate:sint=0):
+    self.first_run, self.display_name, self.dname, self.op_estimate, self.mem_estimate = True, display_name, dname, op_estimate, mem_estimate
+  @property
+  def device(self): return Device[self.dname]
+  def exec(self, rawbufs:List[Buffer], var_vals:Optional[Dict[Variable, int]]=None) -> Optional[float]:
+    return self(rawbufs, {} if var_vals is None else var_vals)
+  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False) -> Optional[float]:
+    raise NotImplementedError("override this")
+
+class CompiledRunner(Runner):
+  def __init__(self, p:Program, precompiled:Optional[bytes]=None):
+    if DEBUG >= 4: print(p.src)
+    self.p:Program = p
+    self.lib:bytes = precompiled if precompiled is not None else Device[p.dname].compiler.compile_cached(p.src)
+    self.clprg = Device[p.dname].runtime(p.function_name, self.lib)
+    super().__init__(p.name, p.dname, p.op_estimate, p.mem_estimate)
+
+  def __reduce__(self): return self.__class__, (self.p, self.lib)
+
+  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False) -> Optional[float]:
+    global_size, local_size = self.p.launch_dims(var_vals)
+    if global_size is not None and local_size is None and all_int(self.p.global_size): # type: ignore[arg-type]
+      # TODO: this is copied from get_program
+      from tinygrad.features.search import optimize_local_size
+      local_size = optimize_local_size(self.clprg, global_size, rawbufs)
+      global_size = [g//l if g%l == 0 else g/l for g,l in zip(global_size, local_size)]
+      self.p = replace(self.p, global_size=global_size, local_size=local_size)
+    lra = {}
+    if global_size:
+      lra['global_size'] = global_size
+      assert len(global_size) == 3, "global size must have len 3"
+    if local_size:
+      lra['local_size'] = local_size
+      assert len(local_size) == 3, "local size must have len 3"
+    return self.clprg(*[x._buf for x in rawbufs], **lra, vals=tuple(var_vals[k] for k in self.p.vars), wait=wait)
+
 class CustomOp(Runner):
  def __init__(self, fxn):
    self.fxn = fxn
@@ -53,6 +123,20 @@ class BufferXfer(BufferCopy):
      src.allocator.track_cross_device.add(dest.allocator.device)
    dest.allocator.transfer(dest._buf, src._buf, dest.nbytes, src_dev=src.allocator.device, dest_dev=dest.allocator.device)

+# **************** method cache ****************
+
+method_cache: Dict[Tuple[str, Tuple[LazyOp, ...], int, bool], CompiledRunner] = {}
+def get_runner(dname:str, ast:Tuple[LazyOp, ...]) -> CompiledRunner:
+  ckey = (dname, ast, BEAM.value, False)
+  if cret:=method_cache.get(ckey): return cret
+  bkey = (dname.split(":")[0], ast, BEAM.value, True)
+  if bret:=method_cache.get(bkey):
+    method_cache[ckey] = ret = CompiledRunner(replace(bret.p, dname=dname), bret.lib)
+  else:
+    prg: Program = get_linearizer(Device[dname].renderer, ast).to_program()
+    method_cache[ckey] = method_cache[bkey] = ret = CompiledRunner(replace(prg, dname=dname))
+  return ret
+
 # **************** lowering functions ****************

@dataclass(frozen=True)
@@ -77,7 +161,7 @@ class ExecItem:
 def lower_schedule_item(si:ScheduleItem) -> ExecItem:
  assert len(set(x.device for x in si.bufs)) == 1 or si.ast[0].op is LoadOps.COPY or getenv("USE_COPY_KERNEL")
  if si.ast[0].op is BufferOps.STORE:
-    runner = Device[si.outputs[0].device].get_runner(*si.ast)
+    runner = get_runner(si.outputs[0].device, si.ast)
    return ExecItem(runner, [si.bufs[x[0]] for x in runner.p.globals])
  assert len(si.ast) == 1 and len(si.outputs) == 1, "only ASTRunner supports multioutput"
  out, ast = si.outputs[0], si.ast[0]
--- a/tinygrad/features/search.py
+++ b/tinygrad/features/search.py
@@ -2,7 +2,7 @@ from typing import Dict, List, cast, DefaultDict, Optional, Tuple, Callable
 import itertools, functools, random, math, time, multiprocessing, traceback, signal
 from collections import defaultdict
 from dataclasses import replace
-from tinygrad.device import Device, Buffer, CompiledRunner, Compiler, Program
+from tinygrad.device import Device, Buffer, Compiler
 from tinygrad.ops import MemBuffer
 from tinygrad.helpers import prod, flatten, DEBUG, CACHELEVEL, diskcache_get, diskcache_put, getenv, Context, colored, to_function_name
 from tinygrad.dtype import ImageDType
@@ -11,6 +11,8 @@ from tinygrad.codegen.kernel import Opt, OptOps, KernelOptError
 from tinygrad.codegen.uops import UOpGraph
 from tinygrad.tensor import Tensor
 from tinygrad.shape.symbolic import sym_infer
+from tinygrad.engine.realize import CompiledRunner
+from tinygrad.renderer import Program

 actions = [Opt(op=OptOps.UPCAST, axis=axis, amt=amt) for amt in [0,2,3,4,5,7] for axis in range(6)]
 actions += [Opt(op=OptOps.UNROLL, axis=axis, amt=amt) for amt in [0,4,7] for axis in range(4)]
--- a/tinygrad/runtime/graph/clang.py
+++ b/tinygrad/runtime/graph/clang.py
@@ -2,8 +2,8 @@ from typing import List, Dict, cast
 import ctypes
 from tinygrad.helpers import dedup, cpu_time_execution, GraphException, DEBUG
 from tinygrad.engine.jit import GraphRunner
-from tinygrad.device import Buffer, Device, CompiledRunner
-from tinygrad.engine.realize import ExecItem
+from tinygrad.device import Buffer, Device
+from tinygrad.engine.realize import ExecItem, CompiledRunner
 from tinygrad.shape.symbolic import Variable
 from tinygrad.runtime.ops_clang import ClangProgram
 from tinygrad.renderer.cstyle import ClangRenderer
--- a/tinygrad/runtime/graph/cuda.py
+++ b/tinygrad/runtime/graph/cuda.py
@@ -2,10 +2,10 @@ import ctypes
 from typing import Any, Optional, Tuple, Dict, List, cast
 import tinygrad.runtime.autogen.cuda as cuda
 from tinygrad.helpers import init_c_var, GraphException
-from tinygrad.device import CompiledRunner, Buffer, Device
+from tinygrad.device import Buffer, Device
 from tinygrad.runtime.ops_cuda import CUDADevice, check, encode_args, cu_time_execution
 from tinygrad.shape.symbolic import Variable
-from tinygrad.engine.realize import ExecItem, BufferXfer
+from tinygrad.engine.realize import ExecItem, BufferXfer, CompiledRunner
 from tinygrad.engine.jit import MultiGraphRunner

 class CUDAGraph(MultiGraphRunner):
--- a/tinygrad/runtime/graph/hcq.py
+++ b/tinygrad/runtime/graph/hcq.py
@@ -2,9 +2,9 @@ import ctypes, collections, array, time
 from typing import List, Any, Dict, cast, Optional, Tuple, Set
 from tinygrad.helpers import GraphException, round_up, to_mv
 from tinygrad.device import Buffer, BufferOptions
-from tinygrad.device import Compiled, CompiledRunner, Device
+from tinygrad.device import Compiled, Device
 from tinygrad.shape.symbolic import Variable
-from tinygrad.engine.realize import ExecItem, BufferXfer
+from tinygrad.engine.realize import ExecItem, BufferXfer, CompiledRunner
 from tinygrad.engine.jit import MultiGraphRunner

 class HCQGraph(MultiGraphRunner):
--- a/tinygrad/runtime/graph/hsa.py
+++ b/tinygrad/runtime/graph/hsa.py
@@ -2,10 +2,10 @@ import ctypes, collections, time, itertools
 from typing import List, Any, Dict, cast, Optional, Tuple
 from tinygrad.helpers import GraphException, init_c_var, round_up
 from tinygrad.device import Buffer, BufferOptions
-from tinygrad.device import Compiled, CompiledRunner, Device
+from tinygrad.device import Compiled, Device
 from tinygrad.shape.symbolic import Variable
 from tinygrad.runtime.ops_hsa import HSADevice, PROFILE, Profiler
-from tinygrad.engine.realize import ExecItem, BufferXfer
+from tinygrad.engine.realize import ExecItem, BufferXfer, CompiledRunner
 from tinygrad.engine.jit import MultiGraphRunner
 import tinygrad.runtime.autogen.hsa as hsa
 from tinygrad.runtime.driver.hsa import check, AQLQueue, AQL_PACKET_SIZE, EMPTY_SIGNAL
--- a/tinygrad/runtime/graph/metal.py
+++ b/tinygrad/runtime/graph/metal.py
@@ -2,8 +2,8 @@ from typing import List, Any, Dict, cast, Optional
 import Metal
 from tinygrad.dtype import dtypes
 from tinygrad.helpers import dedup, unwrap2, GraphException
-from tinygrad.device import Buffer, CompiledRunner
-from tinygrad.engine.realize import ExecItem
+from tinygrad.device import Buffer
+from tinygrad.engine.realize import ExecItem, CompiledRunner
 from tinygrad.engine.jit import GraphRunner
 from tinygrad.shape.symbolic import Variable
 from tinygrad.runtime.ops_metal import wait_check