mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-11 07:27:43 +08:00
all realize 2 (#4527)
* all realize 2 * tests fixup * fix more tests * fix openpilot * fix tests * unneeded
This commit is contained in:
@@ -55,11 +55,12 @@ alu = LazyOp(BinaryOps.ADD, (ld_1, ld_2))
|
||||
st_0 = LazyOp(BufferOps.STORE, (alu,), MemBuffer(0, dtypes.int32, ShapeTracker.from_shape((1,))))
|
||||
|
||||
# convert the computation to a "linearized" format (print the format)
|
||||
lin = Device[DEVICE].get_linearizer(st_0).linearize()
|
||||
from tinygrad.engine.realize import get_linearizer, CompiledRunner
|
||||
lin = get_linearizer(Device[DEVICE].renderer, (st_0,)).linearize()
|
||||
for u in lin.uops: print(u)
|
||||
|
||||
# compile a program (and print the source)
|
||||
fxn = Device[DEVICE].to_runner(lin)
|
||||
fxn = CompiledRunner(lin.to_program())
|
||||
print(fxn.p.src)
|
||||
# NOTE: fxn.clprg is the ClangProgram
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from typing import Tuple, Dict, List
|
||||
from tinygrad.dtype import DType
|
||||
from tinygrad.device import Program
|
||||
from tinygrad.renderer import Program
|
||||
from tinygrad.tensor import Device, Tensor
|
||||
from tinygrad.engine.jit import TinyJit
|
||||
from tinygrad.nn.state import get_state_dict
|
||||
|
||||
@@ -15,9 +15,9 @@ from typing import Tuple, List, Optional, Dict, cast
|
||||
from extra.onnx import get_run_onnx
|
||||
from tinygrad import Tensor, Device, GlobalCounters, dtypes
|
||||
from tinygrad.dtype import ImageDType
|
||||
from tinygrad.device import CompiledRunner, Buffer
|
||||
from tinygrad.device import Buffer
|
||||
from tinygrad.helpers import partition, Context, fetch, getenv, DEBUG
|
||||
from tinygrad.engine.realize import run_schedule, lower_schedule, ExecItem
|
||||
from tinygrad.engine.realize import run_schedule, lower_schedule, ExecItem, CompiledRunner
|
||||
from tinygrad.engine.memory import memory_planner
|
||||
from tinygrad.engine.schedule import create_schedule
|
||||
from tinygrad.ops import LoadOps, ScheduleItem
|
||||
|
||||
3
test/external/external_test_speed_llama.py
vendored
3
test/external/external_test_speed_llama.py
vendored
@@ -4,7 +4,8 @@ from examples.llama import Transformer, MODEL_PARAMS
|
||||
from tinygrad.tensor import Tensor
|
||||
from tinygrad import Device
|
||||
from tinygrad.nn.state import get_state_dict
|
||||
from tinygrad.device import Allocator, method_cache
|
||||
from tinygrad.device import Allocator
|
||||
from tinygrad.engine.realize import method_cache
|
||||
from tinygrad.helpers import Profiling
|
||||
|
||||
class FakeProgram:
|
||||
|
||||
3
test/external/fuzz_linearizer.py
vendored
3
test/external/fuzz_linearizer.py
vendored
@@ -9,6 +9,7 @@ from tinygrad.codegen.linearizer import Linearizer, UOp
|
||||
from tinygrad.codegen.kernel import Opt, OptOps
|
||||
from tinygrad.features.search import get_linearizer_actions, bufs_from_lin
|
||||
from tinygrad.features.graph import print_tree
|
||||
from tinygrad.engine.realize import CompiledRunner
|
||||
from tinygrad.helpers import getenv, from_mv, prod, colored, Context, DEBUG
|
||||
from tinygrad.ops import LazyOp, UnaryOps, BufferOps
|
||||
|
||||
@@ -55,7 +56,7 @@ def run_linearizer(lin: Linearizer, rawbufs=None, var_vals=None):
|
||||
|
||||
# TODO: images needs required_optimization
|
||||
try:
|
||||
prg = device.to_runner(lin)
|
||||
prg = CompiledRunner(lin.to_program())
|
||||
except Exception:
|
||||
traceback.print_exc()
|
||||
return "COMPILE_ERROR"
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import sys
|
||||
import numpy as np
|
||||
from tinygrad import Tensor, Device, dtypes
|
||||
from tinygrad.device import Runner
|
||||
from tinygrad.engine.realize import Runner
|
||||
from tinygrad.dtype import DType
|
||||
from tinygrad.nn.state import get_parameters
|
||||
from tinygrad.helpers import Context, CI, OSX, getenv
|
||||
|
||||
@@ -10,8 +10,10 @@ from tinygrad.dtype import dtypes
|
||||
# *** first, we implement the atan2 op at the lowest level ***
|
||||
# `atan2_gpu` for GPUBuffers and `atan2_cpu` for CPUBuffers
|
||||
from tinygrad.lazy import Buffer, create_lazybuffer
|
||||
from tinygrad.device import CompiledRunner, Device, Program
|
||||
from tinygrad.device import Device
|
||||
from tinygrad.shape.shapetracker import ShapeTracker
|
||||
from tinygrad.engine.realize import CompiledRunner
|
||||
from tinygrad.renderer import Program
|
||||
|
||||
# we don't always have GPU support, so the type signature is the abstract CompiledBuffer instead of GPUBuffer
|
||||
def atan2_gpu(ret:Buffer, a:Buffer, b:Buffer):
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import numpy as np
|
||||
import unittest
|
||||
from dataclasses import replace
|
||||
|
||||
from tinygrad.codegen.kernel import Opt, OptOps, KernelOptError, tensor_cores
|
||||
from tinygrad.codegen.linearizer import Linearizer, UOp, UOps, expand_node, expand_idxs
|
||||
@@ -10,7 +11,7 @@ from tinygrad.shape.view import View
|
||||
from tinygrad.shape.symbolic import MulNode, Variable, NumNode, Node
|
||||
from tinygrad.tensor import Tensor
|
||||
from tinygrad.engine.schedule import create_schedule
|
||||
from tinygrad.engine.realize import run_schedule, lower_schedule
|
||||
from tinygrad.engine.realize import run_schedule, lower_schedule, CompiledRunner
|
||||
from tinygrad.helpers import prod, Context, getenv, CI
|
||||
from tinygrad.dtype import DType, dtypes
|
||||
from tinygrad.codegen.uops import UOpGraph
|
||||
@@ -269,7 +270,7 @@ class TestLinearizer(unittest.TestCase):
|
||||
assert len([uop for uop in k.uops if uop.uop is UOps.WMMA]) > 0, "tensor core not triggered"
|
||||
assert len([x for x in k.applied_opts if x.op is OptOps.TC]) == 1, "tensor core opt not included"
|
||||
|
||||
prg = Device[Device.DEFAULT].to_runner(k)
|
||||
prg = CompiledRunner(k.to_program())
|
||||
real_bufs[0].copyin(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np).data) # Zero to check that all values are filled
|
||||
prg.exec(real_bufs)
|
||||
result = np.frombuffer(real_bufs[0].as_buffer(), real_bufs[0].dtype.np)
|
||||
@@ -586,7 +587,9 @@ def helper_linearizer_opt(r:Tensor, opts=[], apply_tc=False, atol=1e-4, rtol=1e-
|
||||
wanna_output = None
|
||||
realized_ast, real_bufs = helper_realized_ast(r)
|
||||
|
||||
def check_opt(opts, create_k, to_prg, expected_color_size):
|
||||
def get_prg(k:Linearizer): return CompiledRunner(replace(k.to_program(), dname=Device.DEFAULT))
|
||||
|
||||
def check_opt(opts, create_k, expected_color_size):
|
||||
k = create_k()
|
||||
if apply_tc:
|
||||
assert k.apply_tensor_cores(1, extra_opts=opts), "no tensor core triggered"
|
||||
@@ -595,26 +598,26 @@ def helper_linearizer_opt(r:Tensor, opts=[], apply_tc=False, atol=1e-4, rtol=1e-
|
||||
k.apply_opt(opt)
|
||||
if expected_color_size is not None:
|
||||
assert (cs:=[(x,y) for x,y in zip(k.colors(), k.full_shape)]) == expected_color_size, f"expected={expected_color_size} got={cs}"
|
||||
prg = to_prg(k)
|
||||
prg = get_prg(k)
|
||||
real_bufs[0].copyin(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np).data) # Zero to check that all values are filled
|
||||
prg.exec(real_bufs)
|
||||
np.testing.assert_allclose(np.frombuffer(real_bufs[0].as_buffer(), real_bufs[0].dtype.np), wanna_output, atol=atol, rtol=rtol)
|
||||
|
||||
# Get baseline, which is not optimized at all.
|
||||
k = Linearizer(realized_ast)
|
||||
prg = Device[Device.DEFAULT].to_runner(k)
|
||||
prg = get_prg(k)
|
||||
prg.exec(real_bufs)
|
||||
wanna_output = np.frombuffer(real_bufs[0].as_buffer(), real_bufs[0].dtype.np).copy()
|
||||
|
||||
# Check correctness of handcoded optimizations.
|
||||
k = Linearizer(realized_ast)
|
||||
k.hand_coded_optimizations()
|
||||
prg = Device[Device.DEFAULT].to_runner(k)
|
||||
prg = get_prg(k)
|
||||
real_bufs[0].copyin(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np).data) # Zero to check that all values are filled
|
||||
prg.exec(real_bufs)
|
||||
np.testing.assert_allclose(wanna_output, np.frombuffer(real_bufs[0].as_buffer(), real_bufs[0].dtype.np), atol=atol, rtol=rtol)
|
||||
for i, x in enumerate(opts): # Check custom transformations if any.
|
||||
check_opt(x, lambda: Linearizer(realized_ast), Device[Device.DEFAULT].to_runner, color_sizes[i] if i < len(color_sizes) else None)
|
||||
check_opt(x, lambda: Linearizer(realized_ast), color_sizes[i] if i < len(color_sizes) else None)
|
||||
|
||||
class TestKernelOpts(unittest.TestCase):
|
||||
def test_local_and_grouped_reduce(self):
|
||||
|
||||
@@ -1,12 +1,11 @@
|
||||
import unittest, functools, random
|
||||
from typing import List
|
||||
from tinygrad import Tensor, Device, nn, GlobalCounters, TinyJit, dtypes
|
||||
from tinygrad.device import CompiledRunner
|
||||
from tinygrad.ops import LoadOps, ReduceOps
|
||||
from tinygrad.helpers import CI, prod, Context
|
||||
from tinygrad.nn.state import get_parameters, get_state_dict
|
||||
from tinygrad.engine.schedule import create_schedule
|
||||
from tinygrad.engine.realize import lower_schedule, BufferCopy
|
||||
from tinygrad.engine.realize import lower_schedule, BufferCopy, CompiledRunner
|
||||
from tinygrad.features.multi import all_reduce, MultiLazyBuffer
|
||||
from random import randint
|
||||
import numpy as np
|
||||
|
||||
@@ -4,9 +4,11 @@ import numpy as np
|
||||
from tinygrad.tensor import Tensor
|
||||
from tinygrad.helpers import getenv
|
||||
from tinygrad.dtype import dtypes, DType, PtrDType
|
||||
from tinygrad.device import Buffer, Device, CompiledRunner, Program
|
||||
from tinygrad.device import Buffer, Device
|
||||
from tinygrad.ops import UnaryOps, BinaryOps, TernaryOps
|
||||
from tinygrad.renderer import Program
|
||||
from tinygrad.engine.schedule import create_schedule
|
||||
from tinygrad.engine.realize import CompiledRunner, lower_schedule_item
|
||||
from tinygrad.codegen.linearizer import UOps, UOp
|
||||
from tinygrad.codegen.uops import exec_alu, UOpGraph
|
||||
from test.helpers import is_dtype_supported
|
||||
@@ -210,9 +212,8 @@ class TestConstantFolding(unittest.TestCase):
|
||||
t = Tensor(1, dtype=dtypes.float).bitcast(dtypes.int)
|
||||
si = create_schedule([t.lazydata])
|
||||
assert len(si) == 1
|
||||
si = si[0]
|
||||
lin = Device[Device.DEFAULT].get_linearizer(si.ast[0]).linearize()
|
||||
assert any(uop.uop is UOps.BITCAST for uop in lin.uops.uops), f"{[uop.uop for uop in lin.uops.uops]} does not contain bitcast"
|
||||
ji = lower_schedule_item(si[-1])
|
||||
assert any(uop.uop is UOps.BITCAST for uop in ji.prg.p.uops), f"{[uop.uop for uop in ji.prg.p.uops]} does not contain bitcast"
|
||||
|
||||
class TestLocalAccess(unittest.TestCase):
|
||||
@unittest.skipIf(Device.DEFAULT in {"LLVM"}, "device doesn't support local memory")
|
||||
|
||||
@@ -14,7 +14,7 @@ from tinygrad.engine.realize import lower_schedule_item
|
||||
def get_stats(x:Tensor):
|
||||
si = create_schedule([x.lazydata])[-1]
|
||||
ei = lower_schedule_item(si)
|
||||
return ei.prg.p.op_estimate, ei.prg.p.mem_estimate
|
||||
return ei.prg.op_estimate, ei.prg.mem_estimate
|
||||
|
||||
class TestUOpsStats(unittest.TestCase):
|
||||
def test_simple_add(self):
|
||||
|
||||
@@ -59,9 +59,6 @@ tensor_cores: Dict[str, List[TensorCore]] = {
|
||||
"HSA": [TensorCore(dims=(16,16,16), threads=[(0,8),(0,2),(1,2)], thread_local_sizes=[[16],[16],[4,2]], thread_local_aliases=[ [[0],[0],[2],[-1],[1]], [[1],[2],[0],[-1],[0]], [[1],[2],[-2],[0],[3,-1]] ], dtype_in=di, dtype_out=do) for (di, do) in [(dtypes.half, dtypes.float), (dtypes.half, dtypes.half)]], # noqa: E501
|
||||
"CUDA": [TensorCore(dims=(8,16,16), threads=[(0,2),(0,2),(1,2),(1,2),(0,2)], thread_local_sizes=[[2,2,2],[2,2],[2,2]], thread_local_aliases=[ [[0],[0],[5],[-2],[0],[-1,1,2,-3],[3,4]], [[3],[4],[0],[0],[5],[-1,1,2,-2],[0]], [[-1],[1],[5],[-2],[2],[0],[3,4]] ], dtype_in=di, dtype_out=do) for (di, do) in ([(dtypes.half, dtypes.float)] if getenv("PTX") else [(dtypes.half, dtypes.float), (dtypes.bfloat16, dtypes.float)])], # noqa: E501
|
||||
}
|
||||
tensor_cores["AMD"] = tensor_cores["HSA"]
|
||||
tensor_cores["RHIP"] = tensor_cores["HSA"]
|
||||
tensor_cores["NV"] = tensor_cores["CUDA"]
|
||||
|
||||
class LocalBuffer(NamedTuple):
|
||||
name: str
|
||||
|
||||
@@ -1,17 +1,12 @@
|
||||
from __future__ import annotations
|
||||
import multiprocessing
|
||||
from dataclasses import dataclass, replace
|
||||
from dataclasses import dataclass
|
||||
from collections import defaultdict
|
||||
from typing import TYPE_CHECKING, List, Optional, Dict, Tuple, Any
|
||||
from typing import List, Optional, Dict, Tuple, Any
|
||||
import importlib, inspect, functools, pathlib, os, ctypes
|
||||
from tinygrad.helpers import getenv, all_int, diskcache_get, diskcache_put, DEBUG,BEAM,NOOPT, GlobalCounters, flat_mv, from_mv
|
||||
from tinygrad.shape.symbolic import Variable, sint
|
||||
from tinygrad.helpers import getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, from_mv
|
||||
from tinygrad.dtype import DType, ImageDType
|
||||
from tinygrad.ops import LazyOp
|
||||
from tinygrad.renderer import Renderer, Program
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from tinygrad.codegen.linearizer import Linearizer
|
||||
from tinygrad.renderer import Renderer
|
||||
|
||||
# **************** Device ****************
|
||||
|
||||
@@ -167,18 +162,6 @@ class _MallocAllocator(LRUAllocator):
|
||||
|
||||
MallocAllocator = _MallocAllocator()
|
||||
|
||||
# **************** base Runner + helpers ****************
|
||||
|
||||
class Runner:
|
||||
def __init__(self, display_name:str, dname:str, op_estimate:sint=0, mem_estimate:sint=0):
|
||||
self.first_run, self.display_name, self.dname, self.op_estimate, self.mem_estimate = True, display_name, dname, op_estimate, mem_estimate
|
||||
@property
|
||||
def device(self): return Device[self.dname]
|
||||
def exec(self, rawbufs:List[Buffer], var_vals:Optional[Dict[Variable, int]]=None) -> Optional[float]:
|
||||
return self(rawbufs, {} if var_vals is None else var_vals)
|
||||
def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False) -> Optional[float]:
|
||||
raise NotImplementedError("override this")
|
||||
|
||||
# **************** for Compiled Devices ****************
|
||||
|
||||
class Compiler:
|
||||
@@ -190,79 +173,8 @@ class Compiler:
|
||||
if self.cachekey is not None: diskcache_put(self.cachekey, src, lib)
|
||||
return lib
|
||||
|
||||
class CompiledRunner(Runner):
|
||||
def __init__(self, p:Program, precompiled:Optional[bytes]=None):
|
||||
if DEBUG >= 4: print(p.src)
|
||||
self.p:Program = p
|
||||
self.lib:bytes = precompiled if precompiled is not None else Device[p.dname].compiler.compile_cached(p.src)
|
||||
self.clprg = Device[p.dname].runtime(p.function_name, self.lib)
|
||||
super().__init__(p.name, p.dname, p.op_estimate, p.mem_estimate)
|
||||
|
||||
def __reduce__(self): return self.__class__, (self.p, self.lib)
|
||||
|
||||
def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False) -> Optional[float]:
|
||||
global_size, local_size = self.p.launch_dims(var_vals)
|
||||
if global_size is not None and local_size is None and all_int(self.p.global_size): # type: ignore[arg-type]
|
||||
# TODO: this is copied from get_program
|
||||
from tinygrad.features.search import optimize_local_size
|
||||
local_size = optimize_local_size(self.clprg, global_size, rawbufs)
|
||||
global_size = [g//l if g%l == 0 else g/l for g,l in zip(global_size, local_size)]
|
||||
self.p = replace(self.p, global_size=global_size, local_size=local_size)
|
||||
lra = {}
|
||||
if global_size:
|
||||
lra['global_size'] = global_size
|
||||
assert len(global_size) == 3, "global size must have len 3"
|
||||
if local_size:
|
||||
lra['local_size'] = local_size
|
||||
assert len(local_size) == 3, "local size must have len 3"
|
||||
return self.clprg(*[x._buf for x in rawbufs], **lra, vals=tuple(var_vals[k] for k in self.p.vars), wait=wait)
|
||||
|
||||
method_cache: Dict[Tuple[str, Tuple[LazyOp, ...], int, bool], CompiledRunner] = {}
|
||||
logkerns, logkerns_level = open(getenv("LOGKERNS", ""), "a") if getenv("LOGKERNS", "") else None, getenv("LOGKERNS_LEVEL", 1)
|
||||
class Compiled:
|
||||
def __init__(self, device:str, allocator:Allocator, renderer:Optional[Renderer], compiler:Optional[Compiler], runtime, graph=None):
|
||||
self.dname, self.allocator, self.compiler, self.runtime, self.graph = device, allocator, compiler if compiler else Compiler(), runtime, graph
|
||||
self.renderer = renderer if renderer else Renderer()
|
||||
def synchronize(self): pass # override this in your device
|
||||
|
||||
def to_runner(self, k:Linearizer) -> CompiledRunner: return CompiledRunner(replace(k.to_program(), dname=self.dname))
|
||||
|
||||
def get_linearizer(self, *ast:LazyOp) -> Linearizer:
|
||||
if DEBUG >= 3:
|
||||
from tinygrad.features.graph import print_tree
|
||||
for op in ast: print_tree(op)
|
||||
from tinygrad.codegen.linearizer import Linearizer
|
||||
k = Linearizer(*ast, opts=self.renderer)
|
||||
k.required_optimizations()
|
||||
if not NOOPT:
|
||||
if not (used_tensor_cores:=k.apply_tensor_cores(getenv("TC", 1))): k.hand_coded_optimizations()
|
||||
if BEAM >= 1:
|
||||
from tinygrad.features.search import beam_search, time_linearizer, bufs_from_lin
|
||||
kb, k_opt = Linearizer(*ast, opts=self.renderer), k
|
||||
kb.required_optimizations()
|
||||
rawbufs = bufs_from_lin(kb, allocate=False)
|
||||
k = beam_search(kb, rawbufs, BEAM.value, bool(getenv("BEAM_ESTIMATE", 1)))
|
||||
if getenv("BEAM_COMPARE", 1):
|
||||
# TODO: move the HC/TC/BEAM compare to beam_search so it can be optionally cached which choice is better
|
||||
lins: List[Tuple[str, Linearizer]] = [(f"beam{BEAM.value}", k), (("tc" if used_tensor_cores else "hc"), k_opt)]
|
||||
if used_tensor_cores:
|
||||
lins.append(("hc", Linearizer(*ast, opts=self.renderer)))
|
||||
lins[-1][1].hand_coded_optimizations()
|
||||
timed = sorted([(nm, tk, time_linearizer(tk, rawbufs, allow_test_size=False, clear_l2=True)) for nm, tk in lins], key=lambda x: x[2])
|
||||
if DEBUG >= 1: print(" < ".join(f"{nm:6s} : {lin.colored_shape(30, dense=True)} : {tm*1e6:8.2f} us" for nm, lin, tm in timed))
|
||||
k = timed[0][1]
|
||||
if logkerns is not None and logkerns_level > 1: logkerns.writelines([f"{(lin.ast, lin.applied_opts)}\n" for (_,lin,_) in timed[1:]])
|
||||
# TODO: check the correctness inline once compare_linearizer is in core
|
||||
if logkerns is not None: logkerns.writelines([f"{(k.ast, k.applied_opts)}\n"])
|
||||
if DEBUG >= 4: print((k.ast, k.applied_opts)) # print here to show final applied_opts for all kernels instead of just in beam_search
|
||||
return k
|
||||
|
||||
def get_runner(self, *ast:LazyOp) -> CompiledRunner:
|
||||
ckey = (self.dname, ast, BEAM.value, False)
|
||||
if cret:=method_cache.get(ckey): return cret
|
||||
bkey = (self.dname.split(":")[0], ast, BEAM.value, True)
|
||||
if bret:=method_cache.get(bkey):
|
||||
method_cache[ckey] = ret = CompiledRunner(replace(bret.p, dname=self.dname), bret.lib)
|
||||
else:
|
||||
method_cache[ckey] = method_cache[bkey] = ret = self.to_runner(self.get_linearizer(*ast))
|
||||
return ret
|
||||
|
||||
@@ -4,11 +4,11 @@ import functools, itertools, collections
|
||||
from tinygrad.tensor import Tensor
|
||||
from tinygrad.lazy import LazyBuffer
|
||||
from tinygrad.helpers import flatten, merge_dicts, DEBUG, Context, GRAPH, BEAM, getenv, all_int, GraphException, colored, JIT
|
||||
from tinygrad.device import Buffer, CompiledRunner, Compiled, Device, Runner
|
||||
from tinygrad.device import Buffer, Compiled, Device
|
||||
from tinygrad.dtype import DType
|
||||
from tinygrad.shape.shapetracker import ShapeTracker
|
||||
from tinygrad.shape.symbolic import Variable, sint
|
||||
from tinygrad.engine.realize import ExecItem, capturing, EmptyOp, ViewOp, BufferXfer
|
||||
from tinygrad.engine.realize import ExecItem, capturing, EmptyOp, ViewOp, BufferXfer, CompiledRunner, Runner
|
||||
from tinygrad.engine.memory import _internal_memory_planner
|
||||
from tinygrad.nn.state import get_parameters
|
||||
from weakref import WeakKeyDictionary
|
||||
|
||||
@@ -1,14 +1,84 @@
|
||||
from typing import List, Dict, Optional, cast, Generator
|
||||
from typing import List, Dict, Optional, cast, Generator, Tuple
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from tinygrad.helpers import colored, getenv, DEBUG, GlobalCounters, ansilen
|
||||
from tinygrad.ops import ScheduleItem, BufferOps, LoadOps
|
||||
from tinygrad.device import Runner, Device
|
||||
from tinygrad.device import Buffer
|
||||
from tinygrad.shape.symbolic import Variable, sym_infer
|
||||
from dataclasses import dataclass, replace
|
||||
from tinygrad.helpers import colored, getenv, DEBUG, GlobalCounters, ansilen, BEAM, NOOPT, all_int
|
||||
from tinygrad.ops import ScheduleItem, BufferOps, LoadOps, LazyOp
|
||||
from tinygrad.device import Device, Buffer
|
||||
from tinygrad.shape.symbolic import Variable, sym_infer, sint
|
||||
from tinygrad.renderer import Renderer, Program
|
||||
from tinygrad.codegen.linearizer import Linearizer
|
||||
|
||||
# **************** Program Creation ****************
|
||||
|
||||
logkerns, logkerns_level = open(getenv("LOGKERNS", ""), "a") if getenv("LOGKERNS", "") else None, getenv("LOGKERNS_LEVEL", 1)
|
||||
def get_linearizer(renderer:Renderer, ast:Tuple[LazyOp, ...]) -> Linearizer:
  """Build a Linearizer for `ast`, apply optimizations to it, and return it.

  Args:
    renderer: passed to every Linearizer created here as `opts`.
    ast: the LazyOps forwarded positionally to the Linearizer constructor.

  Returns:
    The selected Linearizer. `required_optimizations()` is always applied to the
    initial one. Unless NOOPT is set, tensor cores are tried first (argument taken
    from the TC env var) and hand-coded optimizations are the fallback. With
    BEAM >= 1 a beam search runs on a fresh Linearizer and, when BEAM_COMPARE is
    truthy, its result is timed against the tensor-core/hand-coded candidates and
    the fastest one is kept.
  """
  if DEBUG >= 3:
    from tinygrad.features.graph import print_tree
    for node in ast:
      print_tree(node)

  k = Linearizer(*ast, opts=renderer)
  k.required_optimizations()

  if not NOOPT:
    used_tensor_cores = k.apply_tensor_cores(getenv("TC", 1))
    if not used_tensor_cores:
      k.hand_coded_optimizations()

    if BEAM >= 1:
      from tinygrad.features.search import beam_search, time_linearizer, bufs_from_lin
      # keep the non-beam result around so it can be timed against the beam result
      k_opt = k
      kb = Linearizer(*ast, opts=renderer)
      kb.required_optimizations()
      rawbufs = bufs_from_lin(kb, allocate=False)
      k = beam_search(kb, rawbufs, BEAM.value, bool(getenv("BEAM_ESTIMATE", 1)))

      if getenv("BEAM_COMPARE", 1):
        # TODO: move the HC/TC/BEAM compare to beam_search so it can be optionally cached which choice is better
        lins: List[Tuple[str, Linearizer]] = [
          (f"beam{BEAM.value}", k),
          (("tc" if used_tensor_cores else "hc"), k_opt),
        ]
        if used_tensor_cores:
          # tensor cores were used above, so also time a purely hand-coded variant
          hc_lin = Linearizer(*ast, opts=renderer)
          lins.append(("hc", hc_lin))
          hc_lin.hand_coded_optimizations()
        timed = sorted(
          [(nm, tk, time_linearizer(tk, rawbufs, allow_test_size=False, clear_l2=True)) for nm, tk in lins],
          key=lambda x: x[2])
        if DEBUG >= 1:
          print(" < ".join(f"{nm:6s} : {lin.colored_shape(30, dense=True)} : {tm*1e6:8.2f} us" for nm, lin, tm in timed))
        # fastest candidate wins
        k = timed[0][1]
        if logkerns is not None and logkerns_level > 1:
          # at higher log levels the losing candidates are logged too
          logkerns.writelines([f"{(lin.ast, lin.applied_opts)}\n" for (_,lin,_) in timed[1:]])

  # TODO: check the correctness inline once compare_linearizer is in core
  if logkerns is not None:
    logkerns.writelines([f"{(k.ast, k.applied_opts)}\n"])
  if DEBUG >= 4:
    # print here to show final applied_opts for all kernels instead of just in beam_search
    print((k.ast, k.applied_opts))
  return k
|
||||
|
||||
# **************** Runners ****************
|
||||
|
||||
class Runner:
  """Base class for callable runners; subclasses must implement `__call__`."""

  def __init__(self, display_name:str, dname:str, op_estimate:sint=0, mem_estimate:sint=0):
    """Store the display name, device name and op/memory estimates; mark the runner as not yet run."""
    self.first_run = True
    self.display_name = display_name
    self.dname = dname
    self.op_estimate = op_estimate
    self.mem_estimate = mem_estimate

  @property
  def device(self):
    """The `Device` entry looked up by this runner's device name."""
    return Device[self.dname]

  def exec(self, rawbufs:List[Buffer], var_vals:Optional[Dict[Variable, int]]=None) -> Optional[float]:
    """Call the runner on `rawbufs`, substituting an empty dict when `var_vals` is None."""
    if var_vals is None:
      var_vals = {}
    return self(rawbufs, var_vals)

  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False) -> Optional[float]:
    """Run on `rawbufs` with `var_vals`; not implemented in the base class."""
    raise NotImplementedError("override this")
|
||||
|
||||
class CompiledRunner(Runner):
  """Runner that wraps a `Program`: holds its compiled library and the device runtime handle for it."""

  def __init__(self, p:Program, precompiled:Optional[bytes]=None):
    """Compile `p.src` (unless `precompiled` bytes are supplied) and create the runtime program.

    Args:
      p: the Program to run; its `dname` selects the device used for compiling and loading.
      precompiled: an already-compiled library to use instead of compiling `p.src`.
    """
    if DEBUG >= 4:
      print(p.src)
    self.p:Program = p
    if precompiled is None:
      precompiled = Device[p.dname].compiler.compile_cached(p.src)
    self.lib:bytes = precompiled
    self.clprg = Device[p.dname].runtime(p.function_name, self.lib)
    super().__init__(p.name, p.dname, p.op_estimate, p.mem_estimate)

  def __reduce__(self):
    """Pickle as (class, (program, compiled lib)) so the lib is passed back in as `precompiled`."""
    return self.__class__, (self.p, self.lib)

  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False) -> Optional[float]:
    """Launch the compiled program on `rawbufs` and return whatever the runtime program returns.

    When the program has a global size but no local size, and the stored global size is
    all ints, a local size is chosen with `optimize_local_size` and written back into `self.p`.
    """
    global_size, local_size = self.p.launch_dims(var_vals)
    needs_local = global_size is not None and local_size is None
    if needs_local and all_int(self.p.global_size): # type: ignore[arg-type]
      # TODO: this is copied from get_program
      from tinygrad.features.search import optimize_local_size
      local_size = optimize_local_size(self.clprg, global_size, rawbufs)
      # divide each global dim by its local dim; stays an int when it divides evenly
      global_size = [g//l if g%l == 0 else g/l for g,l in zip(global_size, local_size)]
      self.p = replace(self.p, global_size=global_size, local_size=local_size)

    # only pass the launch dimensions that are actually set
    launch_kwargs = {}
    if global_size:
      launch_kwargs['global_size'] = global_size
      assert len(global_size) == 3, "global size must have len 3"
    if local_size:
      launch_kwargs['local_size'] = local_size
      assert len(local_size) == 3, "local size must have len 3"

    bufs = [x._buf for x in rawbufs]
    vals = tuple(var_vals[k] for k in self.p.vars)
    return self.clprg(*bufs, **launch_kwargs, vals=vals, wait=wait)
|
||||
|
||||
class CustomOp(Runner):
|
||||
def __init__(self, fxn):
|
||||
self.fxn = fxn
|
||||
@@ -53,6 +123,20 @@ class BufferXfer(BufferCopy):
|
||||
src.allocator.track_cross_device.add(dest.allocator.device)
|
||||
dest.allocator.transfer(dest._buf, src._buf, dest.nbytes, src_dev=src.allocator.device, dest_dev=dest.allocator.device)
|
||||
|
||||
# **************** method cache ****************
|
||||
|
||||
method_cache: Dict[Tuple[str, Tuple[LazyOp, ...], int, bool], CompiledRunner] = {}
|
||||
def get_runner(dname:str, ast:Tuple[LazyOp, ...]) -> CompiledRunner:
  """Return a CompiledRunner for `ast` on device `dname`, using `method_cache` where possible.

  Two cache keys are consulted: an exact one built from the full `dname`, and a base one
  built from the part of `dname` before the first ":". A base hit reuses the cached
  program and compiled lib with `dname` swapped in; a full miss linearizes and compiles,
  then stores the new runner under both keys.
  """
  exact_key = (dname, ast, BEAM.value, False)
  cached = method_cache.get(exact_key)
  if cached:
    return cached

  base_key = (dname.split(":")[0], ast, BEAM.value, True)
  base = method_cache.get(base_key)
  if base:
    # same program, different device instance: reuse the compiled lib
    runner = CompiledRunner(replace(base.p, dname=dname), base.lib)
    method_cache[exact_key] = runner
    return runner

  prg: Program = get_linearizer(Device[dname].renderer, ast).to_program()
  runner = CompiledRunner(replace(prg, dname=dname))
  method_cache[exact_key] = runner
  method_cache[base_key] = runner
  return runner
|
||||
|
||||
# **************** lowering functions ****************
|
||||
|
||||
@dataclass(frozen=True)
|
||||
@@ -77,7 +161,7 @@ class ExecItem:
|
||||
def lower_schedule_item(si:ScheduleItem) -> ExecItem:
|
||||
assert len(set(x.device for x in si.bufs)) == 1 or si.ast[0].op is LoadOps.COPY or getenv("USE_COPY_KERNEL")
|
||||
if si.ast[0].op is BufferOps.STORE:
|
||||
runner = Device[si.outputs[0].device].get_runner(*si.ast)
|
||||
runner = get_runner(si.outputs[0].device, si.ast)
|
||||
return ExecItem(runner, [si.bufs[x[0]] for x in runner.p.globals])
|
||||
assert len(si.ast) == 1 and len(si.outputs) == 1, "only ASTRunner supports multioutput"
|
||||
out, ast = si.outputs[0], si.ast[0]
|
||||
|
||||
@@ -2,7 +2,7 @@ from typing import Dict, List, cast, DefaultDict, Optional, Tuple, Callable
|
||||
import itertools, functools, random, math, time, multiprocessing, traceback, signal
|
||||
from collections import defaultdict
|
||||
from dataclasses import replace
|
||||
from tinygrad.device import Device, Buffer, CompiledRunner, Compiler, Program
|
||||
from tinygrad.device import Device, Buffer, Compiler
|
||||
from tinygrad.ops import MemBuffer
|
||||
from tinygrad.helpers import prod, flatten, DEBUG, CACHELEVEL, diskcache_get, diskcache_put, getenv, Context, colored, to_function_name
|
||||
from tinygrad.dtype import ImageDType
|
||||
@@ -11,6 +11,8 @@ from tinygrad.codegen.kernel import Opt, OptOps, KernelOptError
|
||||
from tinygrad.codegen.uops import UOpGraph
|
||||
from tinygrad.tensor import Tensor
|
||||
from tinygrad.shape.symbolic import sym_infer
|
||||
from tinygrad.engine.realize import CompiledRunner
|
||||
from tinygrad.renderer import Program
|
||||
|
||||
actions = [Opt(op=OptOps.UPCAST, axis=axis, amt=amt) for amt in [0,2,3,4,5,7] for axis in range(6)]
|
||||
actions += [Opt(op=OptOps.UNROLL, axis=axis, amt=amt) for amt in [0,4,7] for axis in range(4)]
|
||||
|
||||
@@ -2,8 +2,8 @@ from typing import List, Dict, cast
|
||||
import ctypes
|
||||
from tinygrad.helpers import dedup, cpu_time_execution, GraphException, DEBUG
|
||||
from tinygrad.engine.jit import GraphRunner
|
||||
from tinygrad.device import Buffer, Device, CompiledRunner
|
||||
from tinygrad.engine.realize import ExecItem
|
||||
from tinygrad.device import Buffer, Device
|
||||
from tinygrad.engine.realize import ExecItem, CompiledRunner
|
||||
from tinygrad.shape.symbolic import Variable
|
||||
from tinygrad.runtime.ops_clang import ClangProgram
|
||||
from tinygrad.renderer.cstyle import ClangRenderer
|
||||
|
||||
@@ -2,10 +2,10 @@ import ctypes
|
||||
from typing import Any, Optional, Tuple, Dict, List, cast
|
||||
import tinygrad.runtime.autogen.cuda as cuda
|
||||
from tinygrad.helpers import init_c_var, GraphException
|
||||
from tinygrad.device import CompiledRunner, Buffer, Device
|
||||
from tinygrad.device import Buffer, Device
|
||||
from tinygrad.runtime.ops_cuda import CUDADevice, check, encode_args, cu_time_execution
|
||||
from tinygrad.shape.symbolic import Variable
|
||||
from tinygrad.engine.realize import ExecItem, BufferXfer
|
||||
from tinygrad.engine.realize import ExecItem, BufferXfer, CompiledRunner
|
||||
from tinygrad.engine.jit import MultiGraphRunner
|
||||
|
||||
class CUDAGraph(MultiGraphRunner):
|
||||
|
||||
@@ -2,9 +2,9 @@ import ctypes, collections, array, time
|
||||
from typing import List, Any, Dict, cast, Optional, Tuple, Set
|
||||
from tinygrad.helpers import GraphException, round_up, to_mv
|
||||
from tinygrad.device import Buffer, BufferOptions
|
||||
from tinygrad.device import Compiled, CompiledRunner, Device
|
||||
from tinygrad.device import Compiled, Device
|
||||
from tinygrad.shape.symbolic import Variable
|
||||
from tinygrad.engine.realize import ExecItem, BufferXfer
|
||||
from tinygrad.engine.realize import ExecItem, BufferXfer, CompiledRunner
|
||||
from tinygrad.engine.jit import MultiGraphRunner
|
||||
|
||||
class HCQGraph(MultiGraphRunner):
|
||||
|
||||
@@ -2,10 +2,10 @@ import ctypes, collections, time, itertools
|
||||
from typing import List, Any, Dict, cast, Optional, Tuple
|
||||
from tinygrad.helpers import GraphException, init_c_var, round_up
|
||||
from tinygrad.device import Buffer, BufferOptions
|
||||
from tinygrad.device import Compiled, CompiledRunner, Device
|
||||
from tinygrad.device import Compiled, Device
|
||||
from tinygrad.shape.symbolic import Variable
|
||||
from tinygrad.runtime.ops_hsa import HSADevice, PROFILE, Profiler
|
||||
from tinygrad.engine.realize import ExecItem, BufferXfer
|
||||
from tinygrad.engine.realize import ExecItem, BufferXfer, CompiledRunner
|
||||
from tinygrad.engine.jit import MultiGraphRunner
|
||||
import tinygrad.runtime.autogen.hsa as hsa
|
||||
from tinygrad.runtime.driver.hsa import check, AQLQueue, AQL_PACKET_SIZE, EMPTY_SIGNAL
|
||||
|
||||
@@ -2,8 +2,8 @@ from typing import List, Any, Dict, cast, Optional
|
||||
import Metal
|
||||
from tinygrad.dtype import dtypes
|
||||
from tinygrad.helpers import dedup, unwrap2, GraphException
|
||||
from tinygrad.device import Buffer, CompiledRunner
|
||||
from tinygrad.engine.realize import ExecItem
|
||||
from tinygrad.device import Buffer
|
||||
from tinygrad.engine.realize import ExecItem, CompiledRunner
|
||||
from tinygrad.engine.jit import GraphRunner
|
||||
from tinygrad.shape.symbolic import Variable
|
||||
from tinygrad.runtime.ops_metal import wait_check
|
||||
|
||||
Reference in New Issue
Block a user