all realize 2 (#4527)

* all realize 2

* tests fixup

* fix more tests

* fix openpilot

* fix tests

* unneeded
This commit is contained in:
George Hotz
2024-05-10 22:43:09 -07:00
committed by GitHub
parent d2c347fc74
commit 2f970a4fc2
21 changed files with 142 additions and 139 deletions

View File

@@ -55,11 +55,12 @@ alu = LazyOp(BinaryOps.ADD, (ld_1, ld_2))
st_0 = LazyOp(BufferOps.STORE, (alu,), MemBuffer(0, dtypes.int32, ShapeTracker.from_shape((1,))))
# convert the computation to a "linearized" format (print the format)
lin = Device[DEVICE].get_linearizer(st_0).linearize()
from tinygrad.engine.realize import get_linearizer, CompiledRunner
lin = get_linearizer(Device[DEVICE].renderer, (st_0,)).linearize()
for u in lin.uops: print(u)
# compile a program (and print the source)
fxn = Device[DEVICE].to_runner(lin)
fxn = CompiledRunner(lin.to_program())
print(fxn.p.src)
# NOTE: fxn.clprg is the ClangProgram

View File

@@ -1,6 +1,6 @@
from typing import Tuple, Dict, List
from tinygrad.dtype import DType
from tinygrad.device import Program
from tinygrad.renderer import Program
from tinygrad.tensor import Device, Tensor
from tinygrad.engine.jit import TinyJit
from tinygrad.nn.state import get_state_dict

View File

@@ -15,9 +15,9 @@ from typing import Tuple, List, Optional, Dict, cast
from extra.onnx import get_run_onnx
from tinygrad import Tensor, Device, GlobalCounters, dtypes
from tinygrad.dtype import ImageDType
from tinygrad.device import CompiledRunner, Buffer
from tinygrad.device import Buffer
from tinygrad.helpers import partition, Context, fetch, getenv, DEBUG
from tinygrad.engine.realize import run_schedule, lower_schedule, ExecItem
from tinygrad.engine.realize import run_schedule, lower_schedule, ExecItem, CompiledRunner
from tinygrad.engine.memory import memory_planner
from tinygrad.engine.schedule import create_schedule
from tinygrad.ops import LoadOps, ScheduleItem

View File

@@ -4,7 +4,8 @@ from examples.llama import Transformer, MODEL_PARAMS
from tinygrad.tensor import Tensor
from tinygrad import Device
from tinygrad.nn.state import get_state_dict
from tinygrad.device import Allocator, method_cache
from tinygrad.device import Allocator
from tinygrad.engine.realize import method_cache
from tinygrad.helpers import Profiling
class FakeProgram:

View File

@@ -9,6 +9,7 @@ from tinygrad.codegen.linearizer import Linearizer, UOp
from tinygrad.codegen.kernel import Opt, OptOps
from tinygrad.features.search import get_linearizer_actions, bufs_from_lin
from tinygrad.features.graph import print_tree
from tinygrad.engine.realize import CompiledRunner
from tinygrad.helpers import getenv, from_mv, prod, colored, Context, DEBUG
from tinygrad.ops import LazyOp, UnaryOps, BufferOps
@@ -55,7 +56,7 @@ def run_linearizer(lin: Linearizer, rawbufs=None, var_vals=None):
# TODO: images needs required_optimization
try:
prg = device.to_runner(lin)
prg = CompiledRunner(lin.to_program())
except Exception:
traceback.print_exc()
return "COMPILE_ERROR"

View File

@@ -1,7 +1,7 @@
import sys
import numpy as np
from tinygrad import Tensor, Device, dtypes
from tinygrad.device import Runner
from tinygrad.engine.realize import Runner
from tinygrad.dtype import DType
from tinygrad.nn.state import get_parameters
from tinygrad.helpers import Context, CI, OSX, getenv

View File

@@ -10,8 +10,10 @@ from tinygrad.dtype import dtypes
# *** first, we implement the atan2 op at the lowest level ***
# `atan2_gpu` for GPUBuffers and `atan2_cpu` for CPUBuffers
from tinygrad.lazy import Buffer, create_lazybuffer
from tinygrad.device import CompiledRunner, Device, Program
from tinygrad.device import Device
from tinygrad.shape.shapetracker import ShapeTracker
from tinygrad.engine.realize import CompiledRunner
from tinygrad.renderer import Program
# we don't always have GPU support, so the type signature is the abstract CompiledBuffer instead of GPUBuffer
def atan2_gpu(ret:Buffer, a:Buffer, b:Buffer):

View File

@@ -1,5 +1,6 @@
import numpy as np
import unittest
from dataclasses import replace
from tinygrad.codegen.kernel import Opt, OptOps, KernelOptError, tensor_cores
from tinygrad.codegen.linearizer import Linearizer, UOp, UOps, expand_node, expand_idxs
@@ -10,7 +11,7 @@ from tinygrad.shape.view import View
from tinygrad.shape.symbolic import MulNode, Variable, NumNode, Node
from tinygrad.tensor import Tensor
from tinygrad.engine.schedule import create_schedule
from tinygrad.engine.realize import run_schedule, lower_schedule
from tinygrad.engine.realize import run_schedule, lower_schedule, CompiledRunner
from tinygrad.helpers import prod, Context, getenv, CI
from tinygrad.dtype import DType, dtypes
from tinygrad.codegen.uops import UOpGraph
@@ -269,7 +270,7 @@ class TestLinearizer(unittest.TestCase):
assert len([uop for uop in k.uops if uop.uop is UOps.WMMA]) > 0, "tensor core not triggered"
assert len([x for x in k.applied_opts if x.op is OptOps.TC]) == 1, "tensor core opt not included"
prg = Device[Device.DEFAULT].to_runner(k)
prg = CompiledRunner(k.to_program())
real_bufs[0].copyin(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np).data) # Zero to check that all values are filled
prg.exec(real_bufs)
result = np.frombuffer(real_bufs[0].as_buffer(), real_bufs[0].dtype.np)
@@ -586,7 +587,9 @@ def helper_linearizer_opt(r:Tensor, opts=[], apply_tc=False, atol=1e-4, rtol=1e-
wanna_output = None
realized_ast, real_bufs = helper_realized_ast(r)
def check_opt(opts, create_k, to_prg, expected_color_size):
def get_prg(k:Linearizer): return CompiledRunner(replace(k.to_program(), dname=Device.DEFAULT))
def check_opt(opts, create_k, expected_color_size):
k = create_k()
if apply_tc:
assert k.apply_tensor_cores(1, extra_opts=opts), "no tensor core triggered"
@@ -595,26 +598,26 @@ def helper_linearizer_opt(r:Tensor, opts=[], apply_tc=False, atol=1e-4, rtol=1e-
k.apply_opt(opt)
if expected_color_size is not None:
assert (cs:=[(x,y) for x,y in zip(k.colors(), k.full_shape)]) == expected_color_size, f"expected={expected_color_size} got={cs}"
prg = to_prg(k)
prg = get_prg(k)
real_bufs[0].copyin(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np).data) # Zero to check that all values are filled
prg.exec(real_bufs)
np.testing.assert_allclose(np.frombuffer(real_bufs[0].as_buffer(), real_bufs[0].dtype.np), wanna_output, atol=atol, rtol=rtol)
# Get baseline, which is not optimized at all.
k = Linearizer(realized_ast)
prg = Device[Device.DEFAULT].to_runner(k)
prg = get_prg(k)
prg.exec(real_bufs)
wanna_output = np.frombuffer(real_bufs[0].as_buffer(), real_bufs[0].dtype.np).copy()
# Check correctness of handcoded optimizations.
k = Linearizer(realized_ast)
k.hand_coded_optimizations()
prg = Device[Device.DEFAULT].to_runner(k)
prg = get_prg(k)
real_bufs[0].copyin(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np).data) # Zero to check that all values are filled
prg.exec(real_bufs)
np.testing.assert_allclose(wanna_output, np.frombuffer(real_bufs[0].as_buffer(), real_bufs[0].dtype.np), atol=atol, rtol=rtol)
for i, x in enumerate(opts): # Check custom transformations if any.
check_opt(x, lambda: Linearizer(realized_ast), Device[Device.DEFAULT].to_runner, color_sizes[i] if i < len(color_sizes) else None)
check_opt(x, lambda: Linearizer(realized_ast), color_sizes[i] if i < len(color_sizes) else None)
class TestKernelOpts(unittest.TestCase):
def test_local_and_grouped_reduce(self):

View File

@@ -1,12 +1,11 @@
import unittest, functools, random
from typing import List
from tinygrad import Tensor, Device, nn, GlobalCounters, TinyJit, dtypes
from tinygrad.device import CompiledRunner
from tinygrad.ops import LoadOps, ReduceOps
from tinygrad.helpers import CI, prod, Context
from tinygrad.nn.state import get_parameters, get_state_dict
from tinygrad.engine.schedule import create_schedule
from tinygrad.engine.realize import lower_schedule, BufferCopy
from tinygrad.engine.realize import lower_schedule, BufferCopy, CompiledRunner
from tinygrad.features.multi import all_reduce, MultiLazyBuffer
from random import randint
import numpy as np

View File

@@ -4,9 +4,11 @@ import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.helpers import getenv
from tinygrad.dtype import dtypes, DType, PtrDType
from tinygrad.device import Buffer, Device, CompiledRunner, Program
from tinygrad.device import Buffer, Device
from tinygrad.ops import UnaryOps, BinaryOps, TernaryOps
from tinygrad.renderer import Program
from tinygrad.engine.schedule import create_schedule
from tinygrad.engine.realize import CompiledRunner, lower_schedule_item
from tinygrad.codegen.linearizer import UOps, UOp
from tinygrad.codegen.uops import exec_alu, UOpGraph
from test.helpers import is_dtype_supported
@@ -210,9 +212,8 @@ class TestConstantFolding(unittest.TestCase):
t = Tensor(1, dtype=dtypes.float).bitcast(dtypes.int)
si = create_schedule([t.lazydata])
assert len(si) == 1
si = si[0]
lin = Device[Device.DEFAULT].get_linearizer(si.ast[0]).linearize()
assert any(uop.uop is UOps.BITCAST for uop in lin.uops.uops), f"{[uop.uop for uop in lin.uops.uops]} does not contain bitcast"
ji = lower_schedule_item(si[-1])
assert any(uop.uop is UOps.BITCAST for uop in ji.prg.p.uops), f"{[uop.uop for uop in ji.prg.p.uops]} does not contain bitcast"
class TestLocalAccess(unittest.TestCase):
@unittest.skipIf(Device.DEFAULT in {"LLVM"}, "device doesn't support local memory")

View File

@@ -14,7 +14,7 @@ from tinygrad.engine.realize import lower_schedule_item
def get_stats(x:Tensor):
si = create_schedule([x.lazydata])[-1]
ei = lower_schedule_item(si)
return ei.prg.p.op_estimate, ei.prg.p.mem_estimate
return ei.prg.op_estimate, ei.prg.mem_estimate
class TestUOpsStats(unittest.TestCase):
def test_simple_add(self):

View File

@@ -59,9 +59,6 @@ tensor_cores: Dict[str, List[TensorCore]] = {
"HSA": [TensorCore(dims=(16,16,16), threads=[(0,8),(0,2),(1,2)], thread_local_sizes=[[16],[16],[4,2]], thread_local_aliases=[ [[0],[0],[2],[-1],[1]], [[1],[2],[0],[-1],[0]], [[1],[2],[-2],[0],[3,-1]] ], dtype_in=di, dtype_out=do) for (di, do) in [(dtypes.half, dtypes.float), (dtypes.half, dtypes.half)]], # noqa: E501
"CUDA": [TensorCore(dims=(8,16,16), threads=[(0,2),(0,2),(1,2),(1,2),(0,2)], thread_local_sizes=[[2,2,2],[2,2],[2,2]], thread_local_aliases=[ [[0],[0],[5],[-2],[0],[-1,1,2,-3],[3,4]], [[3],[4],[0],[0],[5],[-1,1,2,-2],[0]], [[-1],[1],[5],[-2],[2],[0],[3,4]] ], dtype_in=di, dtype_out=do) for (di, do) in ([(dtypes.half, dtypes.float)] if getenv("PTX") else [(dtypes.half, dtypes.float), (dtypes.bfloat16, dtypes.float)])], # noqa: E501
}
tensor_cores["AMD"] = tensor_cores["HSA"]
tensor_cores["RHIP"] = tensor_cores["HSA"]
tensor_cores["NV"] = tensor_cores["CUDA"]
class LocalBuffer(NamedTuple):
name: str

View File

@@ -1,17 +1,12 @@
from __future__ import annotations
import multiprocessing
from dataclasses import dataclass, replace
from dataclasses import dataclass
from collections import defaultdict
from typing import TYPE_CHECKING, List, Optional, Dict, Tuple, Any
from typing import List, Optional, Dict, Tuple, Any
import importlib, inspect, functools, pathlib, os, ctypes
from tinygrad.helpers import getenv, all_int, diskcache_get, diskcache_put, DEBUG,BEAM,NOOPT, GlobalCounters, flat_mv, from_mv
from tinygrad.shape.symbolic import Variable, sint
from tinygrad.helpers import getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, from_mv
from tinygrad.dtype import DType, ImageDType
from tinygrad.ops import LazyOp
from tinygrad.renderer import Renderer, Program
if TYPE_CHECKING:
from tinygrad.codegen.linearizer import Linearizer
from tinygrad.renderer import Renderer
# **************** Device ****************
@@ -167,18 +162,6 @@ class _MallocAllocator(LRUAllocator):
MallocAllocator = _MallocAllocator()
# **************** base Runner + helpers ****************
class Runner:
def __init__(self, display_name:str, dname:str, op_estimate:sint=0, mem_estimate:sint=0):
self.first_run, self.display_name, self.dname, self.op_estimate, self.mem_estimate = True, display_name, dname, op_estimate, mem_estimate
@property
def device(self): return Device[self.dname]
def exec(self, rawbufs:List[Buffer], var_vals:Optional[Dict[Variable, int]]=None) -> Optional[float]:
return self(rawbufs, {} if var_vals is None else var_vals)
def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False) -> Optional[float]:
raise NotImplementedError("override this")
# **************** for Compiled Devices ****************
class Compiler:
@@ -190,79 +173,8 @@ class Compiler:
if self.cachekey is not None: diskcache_put(self.cachekey, src, lib)
return lib
class CompiledRunner(Runner):
def __init__(self, p:Program, precompiled:Optional[bytes]=None):
if DEBUG >= 4: print(p.src)
self.p:Program = p
self.lib:bytes = precompiled if precompiled is not None else Device[p.dname].compiler.compile_cached(p.src)
self.clprg = Device[p.dname].runtime(p.function_name, self.lib)
super().__init__(p.name, p.dname, p.op_estimate, p.mem_estimate)
def __reduce__(self): return self.__class__, (self.p, self.lib)
def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False) -> Optional[float]:
global_size, local_size = self.p.launch_dims(var_vals)
if global_size is not None and local_size is None and all_int(self.p.global_size): # type: ignore[arg-type]
# TODO: this is copied from get_program
from tinygrad.features.search import optimize_local_size
local_size = optimize_local_size(self.clprg, global_size, rawbufs)
global_size = [g//l if g%l == 0 else g/l for g,l in zip(global_size, local_size)]
self.p = replace(self.p, global_size=global_size, local_size=local_size)
lra = {}
if global_size:
lra['global_size'] = global_size
assert len(global_size) == 3, "global size must have len 3"
if local_size:
lra['local_size'] = local_size
assert len(local_size) == 3, "local size must have len 3"
return self.clprg(*[x._buf for x in rawbufs], **lra, vals=tuple(var_vals[k] for k in self.p.vars), wait=wait)
method_cache: Dict[Tuple[str, Tuple[LazyOp, ...], int, bool], CompiledRunner] = {}
logkerns, logkerns_level = open(getenv("LOGKERNS", ""), "a") if getenv("LOGKERNS", "") else None, getenv("LOGKERNS_LEVEL", 1)
class Compiled:
def __init__(self, device:str, allocator:Allocator, renderer:Optional[Renderer], compiler:Optional[Compiler], runtime, graph=None):
self.dname, self.allocator, self.compiler, self.runtime, self.graph = device, allocator, compiler if compiler else Compiler(), runtime, graph
self.renderer = renderer if renderer else Renderer()
def synchronize(self): pass # override this in your device
def to_runner(self, k:Linearizer) -> CompiledRunner: return CompiledRunner(replace(k.to_program(), dname=self.dname))
def get_linearizer(self, *ast:LazyOp) -> Linearizer:
if DEBUG >= 3:
from tinygrad.features.graph import print_tree
for op in ast: print_tree(op)
from tinygrad.codegen.linearizer import Linearizer
k = Linearizer(*ast, opts=self.renderer)
k.required_optimizations()
if not NOOPT:
if not (used_tensor_cores:=k.apply_tensor_cores(getenv("TC", 1))): k.hand_coded_optimizations()
if BEAM >= 1:
from tinygrad.features.search import beam_search, time_linearizer, bufs_from_lin
kb, k_opt = Linearizer(*ast, opts=self.renderer), k
kb.required_optimizations()
rawbufs = bufs_from_lin(kb, allocate=False)
k = beam_search(kb, rawbufs, BEAM.value, bool(getenv("BEAM_ESTIMATE", 1)))
if getenv("BEAM_COMPARE", 1):
# TODO: move the HC/TC/BEAM compare to beam_search so it can be optionally cached which choice is better
lins: List[Tuple[str, Linearizer]] = [(f"beam{BEAM.value}", k), (("tc" if used_tensor_cores else "hc"), k_opt)]
if used_tensor_cores:
lins.append(("hc", Linearizer(*ast, opts=self.renderer)))
lins[-1][1].hand_coded_optimizations()
timed = sorted([(nm, tk, time_linearizer(tk, rawbufs, allow_test_size=False, clear_l2=True)) for nm, tk in lins], key=lambda x: x[2])
if DEBUG >= 1: print(" < ".join(f"{nm:6s} : {lin.colored_shape(30, dense=True)} : {tm*1e6:8.2f} us" for nm, lin, tm in timed))
k = timed[0][1]
if logkerns is not None and logkerns_level > 1: logkerns.writelines([f"{(lin.ast, lin.applied_opts)}\n" for (_,lin,_) in timed[1:]])
# TODO: check the correctness inline once compare_linearizer is in core
if logkerns is not None: logkerns.writelines([f"{(k.ast, k.applied_opts)}\n"])
if DEBUG >= 4: print((k.ast, k.applied_opts)) # print here to show final applied_opts for all kernels instead of just in beam_search
return k
def get_runner(self, *ast:LazyOp) -> CompiledRunner:
ckey = (self.dname, ast, BEAM.value, False)
if cret:=method_cache.get(ckey): return cret
bkey = (self.dname.split(":")[0], ast, BEAM.value, True)
if bret:=method_cache.get(bkey):
method_cache[ckey] = ret = CompiledRunner(replace(bret.p, dname=self.dname), bret.lib)
else:
method_cache[ckey] = method_cache[bkey] = ret = self.to_runner(self.get_linearizer(*ast))
return ret

View File

@@ -4,11 +4,11 @@ import functools, itertools, collections
from tinygrad.tensor import Tensor
from tinygrad.lazy import LazyBuffer
from tinygrad.helpers import flatten, merge_dicts, DEBUG, Context, GRAPH, BEAM, getenv, all_int, GraphException, colored, JIT
from tinygrad.device import Buffer, CompiledRunner, Compiled, Device, Runner
from tinygrad.device import Buffer, Compiled, Device
from tinygrad.dtype import DType
from tinygrad.shape.shapetracker import ShapeTracker
from tinygrad.shape.symbolic import Variable, sint
from tinygrad.engine.realize import ExecItem, capturing, EmptyOp, ViewOp, BufferXfer
from tinygrad.engine.realize import ExecItem, capturing, EmptyOp, ViewOp, BufferXfer, CompiledRunner, Runner
from tinygrad.engine.memory import _internal_memory_planner
from tinygrad.nn.state import get_parameters
from weakref import WeakKeyDictionary

View File

@@ -1,14 +1,84 @@
from typing import List, Dict, Optional, cast, Generator
from typing import List, Dict, Optional, cast, Generator, Tuple
import time
from dataclasses import dataclass
from tinygrad.helpers import colored, getenv, DEBUG, GlobalCounters, ansilen
from tinygrad.ops import ScheduleItem, BufferOps, LoadOps
from tinygrad.device import Runner, Device
from tinygrad.device import Buffer
from tinygrad.shape.symbolic import Variable, sym_infer
from dataclasses import dataclass, replace
from tinygrad.helpers import colored, getenv, DEBUG, GlobalCounters, ansilen, BEAM, NOOPT, all_int
from tinygrad.ops import ScheduleItem, BufferOps, LoadOps, LazyOp
from tinygrad.device import Device, Buffer
from tinygrad.shape.symbolic import Variable, sym_infer, sint
from tinygrad.renderer import Renderer, Program
from tinygrad.codegen.linearizer import Linearizer
# **************** Program Creation ****************
logkerns, logkerns_level = open(getenv("LOGKERNS", ""), "a") if getenv("LOGKERNS", "") else None, getenv("LOGKERNS_LEVEL", 1)
def get_linearizer(renderer:Renderer, ast:Tuple[LazyOp, ...]) -> Linearizer:
  """Build and optimize a Linearizer for `ast` using `renderer` as its options.

  Required optimizations are always applied. Unless NOOPT is set, tensor cores are
  tried first (controlled by the TC env var) with hand-coded optimizations as the
  fallback. When BEAM >= 1 a beam search is run as well and, if BEAM_COMPARE is
  enabled (the default), the candidates are timed and the one with the lowest
  measured time is kept.

  Args:
    renderer: passed to every Linearizer as `opts`.
    ast: the LazyOps to linearize; unpacked into the Linearizer constructor.

  Returns:
    The selected Linearizer. Its (ast, applied_opts) pair is appended to the
    LOGKERNS file when one is configured.
  """
  if DEBUG >= 3:
    from tinygrad.features.graph import print_tree
    for op in ast: print_tree(op)

  k = Linearizer(*ast, opts=renderer)
  k.required_optimizations()

  if not NOOPT:
    used_tensor_cores = k.apply_tensor_cores(getenv("TC", 1))
    if not used_tensor_cores: k.hand_coded_optimizations()

    if BEAM >= 1:
      from tinygrad.features.search import beam_search, time_linearizer, bufs_from_lin
      # keep the tc/hc result around so it can be compared against the beam result
      k_opt = k
      kb = Linearizer(*ast, opts=renderer)
      kb.required_optimizations()
      rawbufs = bufs_from_lin(kb, allocate=False)
      k = beam_search(kb, rawbufs, BEAM.value, bool(getenv("BEAM_ESTIMATE", 1)))

      if getenv("BEAM_COMPARE", 1):
        # TODO: move the HC/TC/BEAM compare to beam_search so it can be optionally cached which choice is better
        lins: List[Tuple[str, Linearizer]] = [(f"beam{BEAM.value}", k), (("tc" if used_tensor_cores else "hc"), k_opt)]
        if used_tensor_cores:
          # tensor cores were used above, so also time a plain hand-coded variant
          hc_lin = Linearizer(*ast, opts=renderer)
          lins.append(("hc", hc_lin))
          hc_lin.hand_coded_optimizations()
        timed = [(nm, tk, time_linearizer(tk, rawbufs, allow_test_size=False, clear_l2=True)) for nm, tk in lins]
        timed.sort(key=lambda entry: entry[2])
        if DEBUG >= 1: print("  <  ".join(f"{nm:6s} : {lin.colored_shape(30, dense=True)} : {tm*1e6:8.2f} us" for nm, lin, tm in timed))
        k = timed[0][1]
        # at higher log levels the losing candidates are recorded too
        if logkerns is not None and logkerns_level > 1: logkerns.writelines([f"{(lin.ast, lin.applied_opts)}\n" for (_,lin,_) in timed[1:]])

  # TODO: check the correctness inline once compare_linearizer is in core
  if logkerns is not None: logkerns.writelines([f"{(k.ast, k.applied_opts)}\n"])
  if DEBUG >= 4: print((k.ast, k.applied_opts)) # print here to show final applied_opts for all kernels instead of just in beam_search
  return k
# **************** Runners ****************
class Runner:
  """Base class for objects that are executed against a list of buffers.

  Holds the display name, device name and op/memory estimates given to the
  constructor. Subclasses provide the actual work by overriding `__call__`.
  """
  def __init__(self, display_name:str, dname:str, op_estimate:sint=0, mem_estimate:sint=0):
    self.first_run = True
    self.display_name = display_name
    self.dname = dname
    self.op_estimate = op_estimate
    self.mem_estimate = mem_estimate

  @property
  def device(self):
    """The `Device` entry looked up by this runner's device name."""
    return Device[self.dname]

  def exec(self, rawbufs:List[Buffer], var_vals:Optional[Dict[Variable, int]]=None) -> Optional[float]:
    """Call the runner, passing an empty mapping when `var_vals` is None."""
    if var_vals is None: var_vals = {}
    return self(rawbufs, var_vals)

  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False) -> Optional[float]:
    """Run on `rawbufs`; must be overridden by subclasses."""
    raise NotImplementedError("override this")
class CompiledRunner(Runner):
  """Runner that executes a compiled `Program` through the device runtime."""
  def __init__(self, p:Program, precompiled:Optional[bytes]=None):
    """Compile `p.src` (unless `precompiled` bytes are supplied) and create the runtime program.

    Args:
      p: the Program to run; its source is printed when DEBUG >= 4.
      precompiled: already-compiled library bytes to use instead of compiling.
    """
    if DEBUG >= 4: print(p.src)
    self.p:Program = p
    if precompiled is not None:
      self.lib:bytes = precompiled
    else:
      self.lib = Device[p.dname].compiler.compile_cached(p.src)
    self.clprg = Device[p.dname].runtime(p.function_name, self.lib)
    super().__init__(p.name, p.dname, p.op_estimate, p.mem_estimate)

  def __reduce__(self):
    """Pickle as (class, (program, compiled lib)) so the lib is passed back as `precompiled`."""
    return self.__class__, (self.p, self.lib)

  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False) -> Optional[float]:
    """Launch the program on `rawbufs`, passing the values of `self.p.vars` taken from `var_vals`.

    If the program has a global size but no local size, and the global size is all
    ints, a local size is chosen with `optimize_local_size` and stored back on `self.p`.
    """
    global_size, local_size = self.p.launch_dims(var_vals)
    needs_local_size = global_size is not None and local_size is None and all_int(self.p.global_size) # type: ignore[arg-type]
    if needs_local_size:
      # TODO: this is copied from get_program
      from tinygrad.features.search import optimize_local_size
      local_size = optimize_local_size(self.clprg, global_size, rawbufs)
      global_size = [gs//ls if gs%ls == 0 else gs/ls for gs,ls in zip(global_size, local_size)]
      self.p = replace(self.p, global_size=global_size, local_size=local_size)

    # only pass the launch dimensions that are actually set
    lra = {}
    if global_size:
      assert len(global_size) == 3, "global size must have len 3"
      lra['global_size'] = global_size
    if local_size:
      assert len(local_size) == 3, "local size must have len 3"
      lra['local_size'] = local_size

    bufs = [x._buf for x in rawbufs]
    vals = tuple(var_vals[k] for k in self.p.vars)
    return self.clprg(*bufs, **lra, vals=vals, wait=wait)
class CustomOp(Runner):
def __init__(self, fxn):
self.fxn = fxn
@@ -53,6 +123,20 @@ class BufferXfer(BufferCopy):
src.allocator.track_cross_device.add(dest.allocator.device)
dest.allocator.transfer(dest._buf, src._buf, dest.nbytes, src_dev=src.allocator.device, dest_dev=dest.allocator.device)
# **************** method cache ****************
method_cache: Dict[Tuple[str, Tuple[LazyOp, ...], int, bool], CompiledRunner] = {}
def get_runner(dname:str, ast:Tuple[LazyOp, ...]) -> CompiledRunner:
  """Return a CompiledRunner for `ast` on device `dname`, reusing `method_cache` where possible.

  Two cache keys are used: an exact one for `dname`, and a base one keyed on the
  part of `dname` before the first ":". On a base hit the cached program and
  compiled lib are reused with only `dname` replaced; otherwise the ast is
  linearized, compiled and stored under both keys.
  """
  beam = BEAM.value
  exact_key = (dname, ast, beam, False)
  cached = method_cache.get(exact_key)
  if cached: return cached

  base_key = (dname.split(":")[0], ast, beam, True)
  base = method_cache.get(base_key)
  if base:
    # same kernel already built for the base device name: reuse its program and lib
    runner = CompiledRunner(replace(base.p, dname=dname), base.lib)
    method_cache[exact_key] = runner
  else:
    prg: Program = get_linearizer(Device[dname].renderer, ast).to_program()
    runner = CompiledRunner(replace(prg, dname=dname))
    method_cache[exact_key] = runner
    method_cache[base_key] = runner
  return runner
# **************** lowering functions ****************
@dataclass(frozen=True)
@@ -77,7 +161,7 @@ class ExecItem:
def lower_schedule_item(si:ScheduleItem) -> ExecItem:
assert len(set(x.device for x in si.bufs)) == 1 or si.ast[0].op is LoadOps.COPY or getenv("USE_COPY_KERNEL")
if si.ast[0].op is BufferOps.STORE:
runner = Device[si.outputs[0].device].get_runner(*si.ast)
runner = get_runner(si.outputs[0].device, si.ast)
return ExecItem(runner, [si.bufs[x[0]] for x in runner.p.globals])
assert len(si.ast) == 1 and len(si.outputs) == 1, "only ASTRunner supports multioutput"
out, ast = si.outputs[0], si.ast[0]

View File

@@ -2,7 +2,7 @@ from typing import Dict, List, cast, DefaultDict, Optional, Tuple, Callable
import itertools, functools, random, math, time, multiprocessing, traceback, signal
from collections import defaultdict
from dataclasses import replace
from tinygrad.device import Device, Buffer, CompiledRunner, Compiler, Program
from tinygrad.device import Device, Buffer, Compiler
from tinygrad.ops import MemBuffer
from tinygrad.helpers import prod, flatten, DEBUG, CACHELEVEL, diskcache_get, diskcache_put, getenv, Context, colored, to_function_name
from tinygrad.dtype import ImageDType
@@ -11,6 +11,8 @@ from tinygrad.codegen.kernel import Opt, OptOps, KernelOptError
from tinygrad.codegen.uops import UOpGraph
from tinygrad.tensor import Tensor
from tinygrad.shape.symbolic import sym_infer
from tinygrad.engine.realize import CompiledRunner
from tinygrad.renderer import Program
actions = [Opt(op=OptOps.UPCAST, axis=axis, amt=amt) for amt in [0,2,3,4,5,7] for axis in range(6)]
actions += [Opt(op=OptOps.UNROLL, axis=axis, amt=amt) for amt in [0,4,7] for axis in range(4)]

View File

@@ -2,8 +2,8 @@ from typing import List, Dict, cast
import ctypes
from tinygrad.helpers import dedup, cpu_time_execution, GraphException, DEBUG
from tinygrad.engine.jit import GraphRunner
from tinygrad.device import Buffer, Device, CompiledRunner
from tinygrad.engine.realize import ExecItem
from tinygrad.device import Buffer, Device
from tinygrad.engine.realize import ExecItem, CompiledRunner
from tinygrad.shape.symbolic import Variable
from tinygrad.runtime.ops_clang import ClangProgram
from tinygrad.renderer.cstyle import ClangRenderer

View File

@@ -2,10 +2,10 @@ import ctypes
from typing import Any, Optional, Tuple, Dict, List, cast
import tinygrad.runtime.autogen.cuda as cuda
from tinygrad.helpers import init_c_var, GraphException
from tinygrad.device import CompiledRunner, Buffer, Device
from tinygrad.device import Buffer, Device
from tinygrad.runtime.ops_cuda import CUDADevice, check, encode_args, cu_time_execution
from tinygrad.shape.symbolic import Variable
from tinygrad.engine.realize import ExecItem, BufferXfer
from tinygrad.engine.realize import ExecItem, BufferXfer, CompiledRunner
from tinygrad.engine.jit import MultiGraphRunner
class CUDAGraph(MultiGraphRunner):

View File

@@ -2,9 +2,9 @@ import ctypes, collections, array, time
from typing import List, Any, Dict, cast, Optional, Tuple, Set
from tinygrad.helpers import GraphException, round_up, to_mv
from tinygrad.device import Buffer, BufferOptions
from tinygrad.device import Compiled, CompiledRunner, Device
from tinygrad.device import Compiled, Device
from tinygrad.shape.symbolic import Variable
from tinygrad.engine.realize import ExecItem, BufferXfer
from tinygrad.engine.realize import ExecItem, BufferXfer, CompiledRunner
from tinygrad.engine.jit import MultiGraphRunner
class HCQGraph(MultiGraphRunner):

View File

@@ -2,10 +2,10 @@ import ctypes, collections, time, itertools
from typing import List, Any, Dict, cast, Optional, Tuple
from tinygrad.helpers import GraphException, init_c_var, round_up
from tinygrad.device import Buffer, BufferOptions
from tinygrad.device import Compiled, CompiledRunner, Device
from tinygrad.device import Compiled, Device
from tinygrad.shape.symbolic import Variable
from tinygrad.runtime.ops_hsa import HSADevice, PROFILE, Profiler
from tinygrad.engine.realize import ExecItem, BufferXfer
from tinygrad.engine.realize import ExecItem, BufferXfer, CompiledRunner
from tinygrad.engine.jit import MultiGraphRunner
import tinygrad.runtime.autogen.hsa as hsa
from tinygrad.runtime.driver.hsa import check, AQLQueue, AQL_PACKET_SIZE, EMPTY_SIGNAL

View File

@@ -2,8 +2,8 @@ from typing import List, Any, Dict, cast, Optional
import Metal
from tinygrad.dtype import dtypes
from tinygrad.helpers import dedup, unwrap2, GraphException
from tinygrad.device import Buffer, CompiledRunner
from tinygrad.engine.realize import ExecItem
from tinygrad.device import Buffer
from tinygrad.engine.realize import ExecItem, CompiledRunner
from tinygrad.engine.jit import GraphRunner
from tinygrad.shape.symbolic import Variable
from tinygrad.runtime.ops_metal import wait_check