diff --git a/docs-legacy/abstractions2.py b/docs-legacy/abstractions2.py index de0b7933ee..9b95e787a4 100644 --- a/docs-legacy/abstractions2.py +++ b/docs-legacy/abstractions2.py @@ -55,11 +55,12 @@ alu = LazyOp(BinaryOps.ADD, (ld_1, ld_2)) st_0 = LazyOp(BufferOps.STORE, (alu,), MemBuffer(0, dtypes.int32, ShapeTracker.from_shape((1,)))) # convert the computation to a "linearized" format (print the format) -lin = Device[DEVICE].get_linearizer(st_0).linearize() +from tinygrad.engine.realize import get_linearizer, CompiledRunner +lin = get_linearizer(Device[DEVICE].renderer, (st_0,)).linearize() for u in lin.uops: print(u) # compile a program (and print the source) -fxn = Device[DEVICE].to_runner(lin) +fxn = CompiledRunner(lin.to_program()) print(fxn.p.src) # NOTE: fxn.clprg is the ClangProgram diff --git a/extra/export_model.py b/extra/export_model.py index e62cc0c889..4c37e92794 100644 --- a/extra/export_model.py +++ b/extra/export_model.py @@ -1,6 +1,6 @@ from typing import Tuple, Dict, List from tinygrad.dtype import DType -from tinygrad.device import Program +from tinygrad.renderer import Program from tinygrad.tensor import Device, Tensor from tinygrad.engine.jit import TinyJit from tinygrad.nn.state import get_state_dict diff --git a/openpilot/compile2.py b/openpilot/compile2.py index 8a7a730ac7..2ad2ad04a8 100644 --- a/openpilot/compile2.py +++ b/openpilot/compile2.py @@ -15,9 +15,9 @@ from typing import Tuple, List, Optional, Dict, cast from extra.onnx import get_run_onnx from tinygrad import Tensor, Device, GlobalCounters, dtypes from tinygrad.dtype import ImageDType -from tinygrad.device import CompiledRunner, Buffer +from tinygrad.device import Buffer from tinygrad.helpers import partition, Context, fetch, getenv, DEBUG -from tinygrad.engine.realize import run_schedule, lower_schedule, ExecItem +from tinygrad.engine.realize import run_schedule, lower_schedule, ExecItem, CompiledRunner from tinygrad.engine.memory import memory_planner from tinygrad.engine.schedule import create_schedule from tinygrad.ops import LoadOps, ScheduleItem diff --git a/test/external/external_test_speed_llama.py b/test/external/external_test_speed_llama.py index 3939bbb650..75bd8d69ab 100644 --- a/test/external/external_test_speed_llama.py +++ b/test/external/external_test_speed_llama.py @@ -4,7 +4,8 @@ from examples.llama import Transformer, MODEL_PARAMS from tinygrad.tensor import Tensor from tinygrad import Device from tinygrad.nn.state import get_state_dict -from tinygrad.device import Allocator, method_cache +from tinygrad.device import Allocator +from tinygrad.engine.realize import method_cache from tinygrad.helpers import Profiling class FakeProgram: diff --git a/test/external/fuzz_linearizer.py b/test/external/fuzz_linearizer.py index bffaf7dd1d..4c19807350 100644 --- a/test/external/fuzz_linearizer.py +++ b/test/external/fuzz_linearizer.py @@ -9,6 +9,7 @@ from tinygrad.codegen.linearizer import Linearizer, UOp from tinygrad.codegen.kernel import Opt, OptOps from tinygrad.features.search import get_linearizer_actions, bufs_from_lin from tinygrad.features.graph import print_tree +from tinygrad.engine.realize import CompiledRunner from tinygrad.helpers import getenv, from_mv, prod, colored, Context, DEBUG from tinygrad.ops import LazyOp, UnaryOps, BufferOps @@ -55,7 +56,7 @@ def run_linearizer(lin: Linearizer, rawbufs=None, var_vals=None): # TODO: images needs required_optimization try: - prg = device.to_runner(lin) + prg = CompiledRunner(lin.to_program()) except Exception: traceback.print_exc() return "COMPILE_ERROR" diff --git a/test/helpers.py b/test/helpers.py index be2472b10d..71ace0c766 100644 --- a/test/helpers.py +++ b/test/helpers.py @@ -1,7 +1,7 @@ import sys import numpy as np from tinygrad import Tensor, Device, dtypes -from tinygrad.device import Runner +from tinygrad.engine.realize import Runner from tinygrad.dtype import DType from tinygrad.nn.state import get_parameters from tinygrad.helpers import Context, CI, OSX, getenv diff --git a/test/test_custom_function.py b/test/test_custom_function.py index 41cb243f50..abb4de0d95 100644 --- a/test/test_custom_function.py +++ b/test/test_custom_function.py @@ -10,8 +10,10 @@ from tinygrad.dtype import dtypes # *** first, we implement the atan2 op at the lowest level *** # `atan2_gpu` for GPUBuffers and `atan2_cpu` for CPUBuffers from tinygrad.lazy import Buffer, create_lazybuffer -from tinygrad.device import CompiledRunner, Device, Program +from tinygrad.device import Device from tinygrad.shape.shapetracker import ShapeTracker +from tinygrad.engine.realize import CompiledRunner +from tinygrad.renderer import Program # we don't always have GPU support, so the type signature is the abstract CompiledBuffer instead of GPUBuffer def atan2_gpu(ret:Buffer, a:Buffer, b:Buffer): diff --git a/test/test_linearizer.py b/test/test_linearizer.py index 7daf50b6f2..d1ceb631c4 100644 --- a/test/test_linearizer.py +++ b/test/test_linearizer.py @@ -1,5 +1,6 @@ import numpy as np import unittest +from dataclasses import replace from tinygrad.codegen.kernel import Opt, OptOps, KernelOptError, tensor_cores from tinygrad.codegen.linearizer import Linearizer, UOp, UOps, expand_node, expand_idxs @@ -10,7 +11,7 @@ from tinygrad.shape.view import View from tinygrad.shape.symbolic import MulNode, Variable, NumNode, Node from tinygrad.tensor import Tensor from tinygrad.engine.schedule import create_schedule -from tinygrad.engine.realize import run_schedule, lower_schedule +from tinygrad.engine.realize import run_schedule, lower_schedule, CompiledRunner from tinygrad.helpers import prod, Context, getenv, CI from tinygrad.dtype import DType, dtypes from tinygrad.codegen.uops import UOpGraph @@ -269,7 +270,7 @@ class TestLinearizer(unittest.TestCase): assert len([uop for uop in k.uops if uop.uop is UOps.WMMA]) > 0, "tensor core not triggered" assert len([x for x in k.applied_opts if x.op is OptOps.TC]) == 1, "tensor core opt not included" - prg = Device[Device.DEFAULT].to_runner(k) + prg = CompiledRunner(k.to_program()) real_bufs[0].copyin(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np).data) # Zero to check that all values are filled prg.exec(real_bufs) result = np.frombuffer(real_bufs[0].as_buffer(), real_bufs[0].dtype.np) @@ -586,7 +587,9 @@ def helper_linearizer_opt(r:Tensor, opts=[], apply_tc=False, atol=1e-4, rtol=1e- wanna_output = None realized_ast, real_bufs = helper_realized_ast(r) - def check_opt(opts, create_k, to_prg, expected_color_size): + def get_prg(k:Linearizer): return CompiledRunner(replace(k.to_program(), dname=Device.DEFAULT)) + + def check_opt(opts, create_k, expected_color_size): k = create_k() if apply_tc: assert k.apply_tensor_cores(1, extra_opts=opts), "no tensor core triggered" @@ -595,26 +598,26 @@ def helper_linearizer_opt(r:Tensor, opts=[], apply_tc=False, atol=1e-4, rtol=1e- k.apply_opt(opt) if expected_color_size is not None: assert (cs:=[(x,y) for x,y in zip(k.colors(), k.full_shape)]) == expected_color_size, f"expected={expected_color_size} got={cs}" - prg = to_prg(k) + prg = get_prg(k) real_bufs[0].copyin(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np).data) # Zero to check that all values are filled prg.exec(real_bufs) np.testing.assert_allclose(np.frombuffer(real_bufs[0].as_buffer(), real_bufs[0].dtype.np), wanna_output, atol=atol, rtol=rtol) # Get baseline, which is not optimized at all. k = Linearizer(realized_ast) - prg = Device[Device.DEFAULT].to_runner(k) + prg = get_prg(k) prg.exec(real_bufs) wanna_output = np.frombuffer(real_bufs[0].as_buffer(), real_bufs[0].dtype.np).copy() # Check correctness of handcoded optimiztions. k = Linearizer(realized_ast) k.hand_coded_optimizations() - prg = Device[Device.DEFAULT].to_runner(k) + prg = get_prg(k) real_bufs[0].copyin(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np).data) # Zero to check that all values are filled prg.exec(real_bufs) np.testing.assert_allclose(wanna_output, np.frombuffer(real_bufs[0].as_buffer(), real_bufs[0].dtype.np), atol=atol, rtol=rtol) for i, x in enumerate(opts): # Check custom transformations if any. - check_opt(x, lambda: Linearizer(realized_ast), Device[Device.DEFAULT].to_runner, color_sizes[i] if i < len(color_sizes) else None) + check_opt(x, lambda: Linearizer(realized_ast), color_sizes[i] if i < len(color_sizes) else None) class TestKernelOpts(unittest.TestCase): def test_local_and_grouped_reduce(self): diff --git a/test/test_multitensor.py b/test/test_multitensor.py index 3a60f0f30a..6cc9b99e7a 100644 --- a/test/test_multitensor.py +++ b/test/test_multitensor.py @@ -1,12 +1,11 @@ import unittest, functools, random from typing import List from tinygrad import Tensor, Device, nn, GlobalCounters, TinyJit, dtypes -from tinygrad.device import CompiledRunner from tinygrad.ops import LoadOps, ReduceOps from tinygrad.helpers import CI, prod, Context from tinygrad.nn.state import get_parameters, get_state_dict from tinygrad.engine.schedule import create_schedule -from tinygrad.engine.realize import lower_schedule, BufferCopy +from tinygrad.engine.realize import lower_schedule, BufferCopy, CompiledRunner from tinygrad.features.multi import all_reduce, MultiLazyBuffer from random import randint import numpy as np diff --git a/test/test_uops.py b/test/test_uops.py index b7ee7d3f75..7f49f4bab2 100644 --- a/test/test_uops.py +++ b/test/test_uops.py @@ -4,9 +4,11 @@ import numpy as np from tinygrad.tensor import Tensor from tinygrad.helpers import getenv from tinygrad.dtype import dtypes, DType, PtrDType -from tinygrad.device import Buffer, Device, CompiledRunner, Program +from tinygrad.device import Buffer, Device from tinygrad.ops import UnaryOps, BinaryOps, TernaryOps +from tinygrad.renderer import Program from tinygrad.engine.schedule import create_schedule +from tinygrad.engine.realize import CompiledRunner, lower_schedule_item from tinygrad.codegen.linearizer import UOps, UOp from tinygrad.codegen.uops import exec_alu, UOpGraph from test.helpers import is_dtype_supported @@ -210,9 +212,8 @@ class TestConstantFolding(unittest.TestCase): t = Tensor(1, dtype=dtypes.float).bitcast(dtypes.int) si = create_schedule([t.lazydata]) assert len(si) == 1 - si = si[0] - lin = Device[Device.DEFAULT].get_linearizer(si.ast[0]).linearize() - assert any(uop.uop is UOps.BITCAST for uop in lin.uops.uops), f"{[uop.uop for uop in lin.uops.uops]} does not contain bitcast" + ji = lower_schedule_item(si[-1]) + assert any(uop.uop is UOps.BITCAST for uop in ji.prg.p.uops), f"{[uop.uop for uop in ji.prg.p.uops]} does not contain bitcast" class TestLocalAccess(unittest.TestCase): @unittest.skipIf(Device.DEFAULT in {"LLVM"}, "device doesn't support local memory") diff --git a/test/test_uops_stats.py b/test/test_uops_stats.py index 29b968e90e..e65d133a58 100644 --- a/test/test_uops_stats.py +++ b/test/test_uops_stats.py @@ -14,7 +14,7 @@ from tinygrad.engine.realize import lower_schedule_item def get_stats(x:Tensor): si = create_schedule([x.lazydata])[-1] ei = lower_schedule_item(si) - return ei.prg.p.op_estimate, ei.prg.p.mem_estimate + return ei.prg.op_estimate, ei.prg.mem_estimate class TestUOpsStats(unittest.TestCase): def test_simple_add(self): diff --git a/tinygrad/codegen/kernel.py b/tinygrad/codegen/kernel.py index 72e7393bdf..f8ebefe8d9 100644 --- a/tinygrad/codegen/kernel.py +++ b/tinygrad/codegen/kernel.py @@ -59,9 +59,6 @@ tensor_cores: Dict[str, List[TensorCore]] = { "HSA": [TensorCore(dims=(16,16,16), threads=[(0,8),(0,2),(1,2)], thread_local_sizes=[[16],[16],[4,2]], thread_local_aliases=[ [[0],[0],[2],[-1],[1]], [[1],[2],[0],[-1],[0]], [[1],[2],[-2],[0],[3,-1]] ], dtype_in=di, dtype_out=do) for (di, do) in [(dtypes.half, dtypes.float), (dtypes.half, dtypes.half)]], # noqa: E501 "CUDA": [TensorCore(dims=(8,16,16), threads=[(0,2),(0,2),(1,2),(1,2),(0,2)], thread_local_sizes=[[2,2,2],[2,2],[2,2]], thread_local_aliases=[ [[0],[0],[5],[-2],[0],[-1,1,2,-3],[3,4]], [[3],[4],[0],[0],[5],[-1,1,2,-2],[0]], [[-1],[1],[5],[-2],[2],[0],[3,4]] ], dtype_in=di, dtype_out=do) for (di, do) in ([(dtypes.half, dtypes.float)] if getenv("PTX") else [(dtypes.half, dtypes.float), (dtypes.bfloat16, dtypes.float)])], # noqa: E501 } -tensor_cores["AMD"] = tensor_cores["HSA"] -tensor_cores["RHIP"] = tensor_cores["HSA"] -tensor_cores["NV"] = tensor_cores["CUDA"] class LocalBuffer(NamedTuple): name: str diff --git a/tinygrad/device.py b/tinygrad/device.py index c030e7f5c9..1ac629e412 100644 --- a/tinygrad/device.py +++ b/tinygrad/device.py @@ -1,17 +1,12 @@ from __future__ import annotations import multiprocessing -from dataclasses import dataclass, replace +from dataclasses import dataclass from collections import defaultdict -from typing import TYPE_CHECKING, List, Optional, Dict, Tuple, Any +from typing import List, Optional, Dict, Tuple, Any import importlib, inspect, functools, pathlib, os, ctypes -from tinygrad.helpers import getenv, all_int, diskcache_get, diskcache_put, DEBUG,BEAM,NOOPT, GlobalCounters, flat_mv, from_mv -from tinygrad.shape.symbolic import Variable, sint +from tinygrad.helpers import getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, from_mv from tinygrad.dtype import DType, ImageDType -from tinygrad.ops import LazyOp -from tinygrad.renderer import Renderer, Program - -if TYPE_CHECKING: - from tinygrad.codegen.linearizer import Linearizer +from tinygrad.renderer import Renderer # **************** Device **************** @@ -167,18 +162,6 @@ class _MallocAllocator(LRUAllocator): MallocAllocator = _MallocAllocator() -# **************** base Runner + helpers **************** - -class Runner: - def __init__(self, display_name:str, dname:str, op_estimate:sint=0, mem_estimate:sint=0): - self.first_run, self.display_name, self.dname, self.op_estimate, self.mem_estimate = True, display_name, dname, op_estimate, mem_estimate - @property - def device(self): return Device[self.dname] - def exec(self, rawbufs:List[Buffer], var_vals:Optional[Dict[Variable, int]]=None) -> Optional[float]: - return self(rawbufs, {} if var_vals is None else var_vals) - def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False) -> Optional[float]: - raise NotImplementedError("override this") - # **************** for Compiled Devices **************** class Compiler: @@ -190,79 +173,8 @@ class Compiler: if self.cachekey is not None: diskcache_put(self.cachekey, src, lib) return lib -class CompiledRunner(Runner): - def __init__(self, p:Program, precompiled:Optional[bytes]=None): - if DEBUG >= 4: print(p.src) - self.p:Program = p - self.lib:bytes = precompiled if precompiled is not None else Device[p.dname].compiler.compile_cached(p.src) - self.clprg = Device[p.dname].runtime(p.function_name, self.lib) - super().__init__(p.name, p.dname, p.op_estimate, p.mem_estimate) - - def __reduce__(self): return self.__class__, (self.p, self.lib) - - def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False) -> Optional[float]: - global_size, local_size = self.p.launch_dims(var_vals) - if global_size is not None and local_size is None and all_int(self.p.global_size): # type: ignore[arg-type] - # TODO: this is copied from get_program - from tinygrad.features.search import optimize_local_size - local_size = optimize_local_size(self.clprg, global_size, rawbufs) - global_size = [g//l if g%l == 0 else g/l for g,l in zip(global_size, local_size)] - self.p = replace(self.p, global_size=global_size, local_size=local_size) - lra = {} - if global_size: - lra['global_size'] = global_size - assert len(global_size) == 3, "global size must have len 3" - if local_size: - lra['local_size'] = local_size - assert len(local_size) == 3, "local size must have len 3" - return self.clprg(*[x._buf for x in rawbufs], **lra, vals=tuple(var_vals[k] for k in self.p.vars), wait=wait) - -method_cache: Dict[Tuple[str, Tuple[LazyOp, ...], int, bool], CompiledRunner] = {} -logkerns, logkerns_level = open(getenv("LOGKERNS", ""), "a") if getenv("LOGKERNS", "") else None, getenv("LOGKERNS_LEVEL", 1) class Compiled: def __init__(self, device:str, allocator:Allocator, renderer:Optional[Renderer], compiler:Optional[Compiler], runtime, graph=None): self.dname, self.allocator, self.compiler, self.runtime, self.graph = device, allocator, compiler if compiler else Compiler(), runtime, graph self.renderer = renderer if renderer else Renderer() def synchronize(self): pass # override this in your device - - def to_runner(self, k:Linearizer) -> CompiledRunner: return CompiledRunner(replace(k.to_program(), dname=self.dname)) - - def get_linearizer(self, *ast:LazyOp) -> Linearizer: - if DEBUG >= 3: - from tinygrad.features.graph import print_tree - for op in ast: print_tree(op) - from tinygrad.codegen.linearizer import Linearizer - k = Linearizer(*ast, opts=self.renderer) - k.required_optimizations() - if not NOOPT: - if not (used_tensor_cores:=k.apply_tensor_cores(getenv("TC", 1))): k.hand_coded_optimizations() - if BEAM >= 1: - from tinygrad.features.search import beam_search, time_linearizer, bufs_from_lin - kb, k_opt = Linearizer(*ast, opts=self.renderer), k - kb.required_optimizations() - rawbufs = bufs_from_lin(kb, allocate=False) - k = beam_search(kb, rawbufs, BEAM.value, bool(getenv("BEAM_ESTIMATE", 1))) - if getenv("BEAM_COMPARE", 1): - # TODO: move the HC/TC/BEAM compare to beam_search so it can be optionally cached which choice is better - lins: List[Tuple[str, Linearizer]] = [(f"beam{BEAM.value}", k), (("tc" if used_tensor_cores else "hc"), k_opt)] - if used_tensor_cores: - lins.append(("hc", Linearizer(*ast, opts=self.renderer))) - lins[-1][1].hand_coded_optimizations() - timed = sorted([(nm, tk, time_linearizer(tk, rawbufs, allow_test_size=False, clear_l2=True)) for nm, tk in lins], key=lambda x: x[2]) - if DEBUG >= 1: print(" < ".join(f"{nm:6s} : {lin.colored_shape(30, dense=True)} : {tm*1e6:8.2f} us" for nm, lin, tm in timed)) - k = timed[0][1] - if logkerns is not None and logkerns_level > 1: logkerns.writelines([f"{(lin.ast, lin.applied_opts)}\n" for (_,lin,_) in timed[1:]]) - # TODO: check the correctness inline once compare_linearizer is in core - if logkerns is not None: logkerns.writelines([f"{(k.ast, k.applied_opts)}\n"]) - if DEBUG >= 4: print((k.ast, k.applied_opts)) # print here to show final applied_opts for all kernels instead of just in beam_search - return k - - def get_runner(self, *ast:LazyOp) -> CompiledRunner: - ckey = (self.dname, ast, BEAM.value, False) - if cret:=method_cache.get(ckey): return cret - bkey = (self.dname.split(":")[0], ast, BEAM.value, True) - if bret:=method_cache.get(bkey): - method_cache[ckey] = ret = CompiledRunner(replace(bret.p, dname=self.dname), bret.lib) - else: - method_cache[ckey] = method_cache[bkey] = ret = self.to_runner(self.get_linearizer(*ast)) - return ret diff --git a/tinygrad/engine/jit.py b/tinygrad/engine/jit.py index b55e56a797..3b6d08eafb 100644 --- a/tinygrad/engine/jit.py +++ b/tinygrad/engine/jit.py @@ -4,11 +4,11 @@ import functools, itertools, collections from tinygrad.tensor import Tensor from tinygrad.lazy import LazyBuffer from tinygrad.helpers import flatten, merge_dicts, DEBUG, Context, GRAPH, BEAM, getenv, all_int, GraphException, colored, JIT -from tinygrad.device import Buffer, CompiledRunner, Compiled, Device, Runner +from tinygrad.device import Buffer, Compiled, Device from tinygrad.dtype import DType from tinygrad.shape.shapetracker import ShapeTracker from tinygrad.shape.symbolic import Variable, sint -from tinygrad.engine.realize import ExecItem, capturing, EmptyOp, ViewOp, BufferXfer +from tinygrad.engine.realize import ExecItem, capturing, EmptyOp, ViewOp, BufferXfer, CompiledRunner, Runner from tinygrad.engine.memory import _internal_memory_planner from tinygrad.nn.state import get_parameters from weakref import WeakKeyDictionary diff --git a/tinygrad/engine/realize.py b/tinygrad/engine/realize.py index aae187cef2..e9887f6e4d 100644 --- a/tinygrad/engine/realize.py +++ b/tinygrad/engine/realize.py @@ -1,14 +1,84 @@ -from typing import List, Dict, Optional, cast, Generator +from typing import List, Dict, Optional, cast, Generator, Tuple import time -from dataclasses import dataclass -from tinygrad.helpers import colored, getenv, DEBUG, GlobalCounters, ansilen -from tinygrad.ops import ScheduleItem, BufferOps, LoadOps -from tinygrad.device import Runner, Device -from tinygrad.device import Buffer -from tinygrad.shape.symbolic import Variable, sym_infer +from dataclasses import dataclass, replace +from tinygrad.helpers import colored, getenv, DEBUG, GlobalCounters, ansilen, BEAM, NOOPT, all_int +from tinygrad.ops import ScheduleItem, BufferOps, LoadOps, LazyOp +from tinygrad.device import Device, Buffer +from tinygrad.shape.symbolic import Variable, sym_infer, sint +from tinygrad.renderer import Renderer, Program +from tinygrad.codegen.linearizer import Linearizer + +# **************** Program Creation **************** + +logkerns, logkerns_level = open(getenv("LOGKERNS", ""), "a") if getenv("LOGKERNS", "") else None, getenv("LOGKERNS_LEVEL", 1) +def get_linearizer(renderer:Renderer, ast:Tuple[LazyOp, ...]) -> Linearizer: + if DEBUG >= 3: + from tinygrad.features.graph import print_tree + for op in ast: print_tree(op) + k = Linearizer(*ast, opts=renderer) + k.required_optimizations() + if not NOOPT: + if not (used_tensor_cores:=k.apply_tensor_cores(getenv("TC", 1))): k.hand_coded_optimizations() + if BEAM >= 1: + from tinygrad.features.search import beam_search, time_linearizer, bufs_from_lin + kb, k_opt = Linearizer(*ast, opts=renderer), k + kb.required_optimizations() + rawbufs = bufs_from_lin(kb, allocate=False) + k = beam_search(kb, rawbufs, BEAM.value, bool(getenv("BEAM_ESTIMATE", 1))) + if getenv("BEAM_COMPARE", 1): + # TODO: move the HC/TC/BEAM compare to beam_search so it can be optionally cached which choice is better + lins: List[Tuple[str, Linearizer]] = [(f"beam{BEAM.value}", k), (("tc" if used_tensor_cores else "hc"), k_opt)] + if used_tensor_cores: + lins.append(("hc", Linearizer(*ast, opts=renderer))) + lins[-1][1].hand_coded_optimizations() + timed = sorted([(nm, tk, time_linearizer(tk, rawbufs, allow_test_size=False, clear_l2=True)) for nm, tk in lins], key=lambda x: x[2]) + if DEBUG >= 1: print(" < ".join(f"{nm:6s} : {lin.colored_shape(30, dense=True)} : {tm*1e6:8.2f} us" for nm, lin, tm in timed)) + k = timed[0][1] + if logkerns is not None and logkerns_level > 1: logkerns.writelines([f"{(lin.ast, lin.applied_opts)}\n" for (_,lin,_) in timed[1:]]) + # TODO: check the correctness inline once compare_linearizer is in core + if logkerns is not None: logkerns.writelines([f"{(k.ast, k.applied_opts)}\n"]) + if DEBUG >= 4: print((k.ast, k.applied_opts)) # print here to show final applied_opts for all kernels instead of just in beam_search + return k # **************** Runners **************** +class Runner: + def __init__(self, display_name:str, dname:str, op_estimate:sint=0, mem_estimate:sint=0): + self.first_run, self.display_name, self.dname, self.op_estimate, self.mem_estimate = True, display_name, dname, op_estimate, mem_estimate + @property + def device(self): return Device[self.dname] + def exec(self, rawbufs:List[Buffer], var_vals:Optional[Dict[Variable, int]]=None) -> Optional[float]: + return self(rawbufs, {} if var_vals is None else var_vals) + def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False) -> Optional[float]: + raise NotImplementedError("override this") + +class CompiledRunner(Runner): + def __init__(self, p:Program, precompiled:Optional[bytes]=None): + if DEBUG >= 4: print(p.src) + self.p:Program = p + self.lib:bytes = precompiled if precompiled is not None else Device[p.dname].compiler.compile_cached(p.src) + self.clprg = Device[p.dname].runtime(p.function_name, self.lib) + super().__init__(p.name, p.dname, p.op_estimate, p.mem_estimate) + + def __reduce__(self): return self.__class__, (self.p, self.lib) + + def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False) -> Optional[float]: + global_size, local_size = self.p.launch_dims(var_vals) + if global_size is not None and local_size is None and all_int(self.p.global_size): # type: ignore[arg-type] + # TODO: this is copied from get_program + from tinygrad.features.search import optimize_local_size + local_size = optimize_local_size(self.clprg, global_size, rawbufs) + global_size = [g//l if g%l == 0 else g/l for g,l in zip(global_size, local_size)] + self.p = replace(self.p, global_size=global_size, local_size=local_size) + lra = {} + if global_size: + lra['global_size'] = global_size + assert len(global_size) == 3, "global size must have len 3" + if local_size: + lra['local_size'] = local_size + assert len(local_size) == 3, "local size must have len 3" + return self.clprg(*[x._buf for x in rawbufs], **lra, vals=tuple(var_vals[k] for k in self.p.vars), wait=wait) + class CustomOp(Runner): def __init__(self, fxn): self.fxn = fxn @@ -53,6 +123,20 @@ class BufferXfer(BufferCopy): src.allocator.track_cross_device.add(dest.allocator.device) dest.allocator.transfer(dest._buf, src._buf, dest.nbytes, src_dev=src.allocator.device, dest_dev=dest.allocator.device) +# **************** method cache **************** + +method_cache: Dict[Tuple[str, Tuple[LazyOp, ...], int, bool], CompiledRunner] = {} +def get_runner(dname:str, ast:Tuple[LazyOp, ...]) -> CompiledRunner: + ckey = (dname, ast, BEAM.value, False) + if cret:=method_cache.get(ckey): return cret + bkey = (dname.split(":")[0], ast, BEAM.value, True) + if bret:=method_cache.get(bkey): + method_cache[ckey] = ret = CompiledRunner(replace(bret.p, dname=dname), bret.lib) + else: + prg: Program = get_linearizer(Device[dname].renderer, ast).to_program() + method_cache[ckey] = method_cache[bkey] = ret = CompiledRunner(replace(prg, dname=dname)) + return ret + # **************** lowering functions **************** @dataclass(frozen=True) @@ -77,7 +161,7 @@ class ExecItem: def lower_schedule_item(si:ScheduleItem) -> ExecItem: assert len(set(x.device for x in si.bufs)) == 1 or si.ast[0].op is LoadOps.COPY or getenv("USE_COPY_KERNEL") if si.ast[0].op is BufferOps.STORE: - runner = Device[si.outputs[0].device].get_runner(*si.ast) + runner = get_runner(si.outputs[0].device, si.ast) return ExecItem(runner, [si.bufs[x[0]] for x in runner.p.globals]) assert len(si.ast) == 1 and len(si.outputs) == 1, "only ASTRunner supports multioutput" out, ast = si.outputs[0], si.ast[0] diff --git a/tinygrad/features/search.py b/tinygrad/features/search.py index 40f0855cef..5d27cc486a 100644 --- a/tinygrad/features/search.py +++ b/tinygrad/features/search.py @@ -2,7 +2,7 @@ from typing import Dict, List, cast, DefaultDict, Optional, Tuple, Callable import itertools, functools, random, math, time, multiprocessing, traceback, signal from collections import defaultdict from dataclasses import replace -from tinygrad.device import Device, Buffer, CompiledRunner, Compiler, Program +from tinygrad.device import Device, Buffer, Compiler from tinygrad.ops import MemBuffer from tinygrad.helpers import prod, flatten, DEBUG, CACHELEVEL, diskcache_get, diskcache_put, getenv, Context, colored, to_function_name from tinygrad.dtype import ImageDType @@ -11,6 +11,8 @@ from tinygrad.codegen.kernel import Opt, OptOps, KernelOptError from tinygrad.codegen.uops import UOpGraph from tinygrad.tensor import Tensor from tinygrad.shape.symbolic import sym_infer +from tinygrad.engine.realize import CompiledRunner +from tinygrad.renderer import Program actions = [Opt(op=OptOps.UPCAST, axis=axis, amt=amt) for amt in [0,2,3,4,5,7] for axis in range(6)] actions += [Opt(op=OptOps.UNROLL, axis=axis, amt=amt) for amt in [0,4,7] for axis in range(4)] diff --git a/tinygrad/runtime/graph/clang.py b/tinygrad/runtime/graph/clang.py index ec57e038c5..b1fb5d02ab 100644 --- a/tinygrad/runtime/graph/clang.py +++ b/tinygrad/runtime/graph/clang.py @@ -2,8 +2,8 @@ from typing import List, Dict, cast import ctypes from tinygrad.helpers import dedup, cpu_time_execution, GraphException, DEBUG from tinygrad.engine.jit import GraphRunner -from tinygrad.device import Buffer, Device, CompiledRunner -from tinygrad.engine.realize import ExecItem +from tinygrad.device import Buffer, Device +from tinygrad.engine.realize import ExecItem, CompiledRunner from tinygrad.shape.symbolic import Variable from tinygrad.runtime.ops_clang import ClangProgram from tinygrad.renderer.cstyle import ClangRenderer diff --git a/tinygrad/runtime/graph/cuda.py b/tinygrad/runtime/graph/cuda.py index 3f2ad0f4cc..90116f0aba 100644 --- a/tinygrad/runtime/graph/cuda.py +++ b/tinygrad/runtime/graph/cuda.py @@ -2,10 +2,10 @@ import ctypes from typing import Any, Optional, Tuple, Dict, List, cast import tinygrad.runtime.autogen.cuda as cuda from tinygrad.helpers import init_c_var, GraphException -from tinygrad.device import CompiledRunner, Buffer, Device +from tinygrad.device import Buffer, Device from tinygrad.runtime.ops_cuda import CUDADevice, check, encode_args, cu_time_execution from tinygrad.shape.symbolic import Variable -from tinygrad.engine.realize import ExecItem, BufferXfer +from tinygrad.engine.realize import ExecItem, BufferXfer, CompiledRunner from tinygrad.engine.jit import MultiGraphRunner class CUDAGraph(MultiGraphRunner): diff --git a/tinygrad/runtime/graph/hcq.py b/tinygrad/runtime/graph/hcq.py index 73420cbce2..b064ba79ca 100644 --- a/tinygrad/runtime/graph/hcq.py +++ b/tinygrad/runtime/graph/hcq.py @@ -2,9 +2,9 @@ import ctypes, collections, array, time from typing import List, Any, Dict, cast, Optional, Tuple, Set from tinygrad.helpers import GraphException, round_up, to_mv from tinygrad.device import Buffer, BufferOptions -from tinygrad.device import Compiled, CompiledRunner, Device +from tinygrad.device import Compiled, Device from tinygrad.shape.symbolic import Variable -from tinygrad.engine.realize import ExecItem, BufferXfer +from tinygrad.engine.realize import ExecItem, BufferXfer, CompiledRunner from tinygrad.engine.jit import MultiGraphRunner class HCQGraph(MultiGraphRunner): diff --git a/tinygrad/runtime/graph/hsa.py b/tinygrad/runtime/graph/hsa.py index d96076eabb..07c3d66c61 100644 --- a/tinygrad/runtime/graph/hsa.py +++ b/tinygrad/runtime/graph/hsa.py @@ -2,10 +2,10 @@ import ctypes, collections, time, itertools from typing import List, Any, Dict, cast, Optional, Tuple from tinygrad.helpers import GraphException, init_c_var, round_up from tinygrad.device import Buffer, BufferOptions -from tinygrad.device import Compiled, CompiledRunner, Device +from tinygrad.device import Compiled, Device from tinygrad.shape.symbolic import Variable from tinygrad.runtime.ops_hsa import HSADevice, PROFILE, Profiler -from tinygrad.engine.realize import ExecItem, BufferXfer +from tinygrad.engine.realize import ExecItem, BufferXfer, CompiledRunner from tinygrad.engine.jit import MultiGraphRunner import tinygrad.runtime.autogen.hsa as hsa from tinygrad.runtime.driver.hsa import check, AQLQueue, AQL_PACKET_SIZE, EMPTY_SIGNAL diff --git a/tinygrad/runtime/graph/metal.py b/tinygrad/runtime/graph/metal.py index 5d7baef0aa..f1f5524213 100644 --- a/tinygrad/runtime/graph/metal.py +++ b/tinygrad/runtime/graph/metal.py @@ -2,8 +2,8 @@ from typing import List, Any, Dict, cast, Optional import Metal from tinygrad.dtype import dtypes from tinygrad.helpers import dedup, unwrap2, GraphException -from tinygrad.device import Buffer, CompiledRunner -from tinygrad.engine.realize import ExecItem +from tinygrad.device import Buffer +from tinygrad.engine.realize import ExecItem, CompiledRunner from tinygrad.engine.jit import GraphRunner from tinygrad.shape.symbolic import Variable from tinygrad.runtime.ops_metal import wait_check