mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-13 00:15:35 +08:00
update Kernel API in tests + move optimize_local_size (#11907)
This commit is contained in:
@@ -56,7 +56,7 @@ def randoms():
|
||||
def ast_to_cuda_prog(compiler, ast, opts):
|
||||
k = Kernel(ast)
|
||||
k.apply_opts(opts)
|
||||
p = get_program(k.get_optimized_ast(), k.opts)
|
||||
p = get_program(k.ast, k.opts, k.applied_opts)
|
||||
return CUDAProgram(device, p.function_name, compiler.compile(p.src))
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -29,7 +29,7 @@ if __name__ == "__main__":
|
||||
Opt(op=OptOps.LOCAL, axis=0, amt=2),
|
||||
]
|
||||
k.apply_opts(opts)
|
||||
prg = get_program(k.get_optimized_ast(), k.opts)
|
||||
prg = get_program(k.ast, k.opts, k.applied_opts)
|
||||
new_src = prg.src
|
||||
# can mod source here
|
||||
prg = replace(prg, src=new_src)
|
||||
|
||||
@@ -58,7 +58,7 @@ if __name__ == "__main__":
|
||||
GlobalCounters.kernel_count -= 1
|
||||
|
||||
if not getenv("NOOPT"): k.apply_opts(hand_coded_optimizations(k))
|
||||
p2 = get_program(k.get_optimized_ast(), k.opts)
|
||||
p2 = get_program(k.ast, k.opts, k.applied_opts)
|
||||
new_ei = replace(ei, prg=CompiledRunner(p2))
|
||||
new_ei.run()
|
||||
new_jit.append(new_ei)
|
||||
|
||||
@@ -24,5 +24,5 @@ if __name__ == "__main__":
|
||||
#k.apply_opt(Opt(OptOps.GROUP, 1, 32))
|
||||
#k.apply_opt(Opt(OptOps.GROUP, 0, 32))
|
||||
from tinygrad.engine.realize import CompiledRunner, ExecItem
|
||||
run = CompiledRunner(prg:=get_program(k.get_optimized_ast(), k.opts))
|
||||
run = CompiledRunner(prg:=get_program(k.ast, k.opts, k.applied_opts))
|
||||
ExecItem(run, si.bufs).run()
|
||||
|
||||
@@ -35,7 +35,7 @@ k = Kernel(ast)
|
||||
k.apply_opts(opts)
|
||||
bufs = bufs_from_lin(k)
|
||||
|
||||
prg = CompiledRunner(get_program(k.get_optimized_ast(), k.opts))
|
||||
prg = CompiledRunner(get_program(k.ast, k.opts, k.applied_opts))
|
||||
|
||||
for i in range(10):
|
||||
speed = prg(bufs, var_vals={}, wait=True)
|
||||
|
||||
@@ -418,7 +418,7 @@ class TestLinearizer(unittest.TestCase):
|
||||
x, y = Tensor.rand(128, 128, dtype=tc.dtype_in), Tensor.rand(128, 128, dtype=tc.dtype_in)
|
||||
r = x.matmul(y, dtype=tc.dtype_out)
|
||||
k = helper_linearizer_opt(r, [[Opt(OptOps.UNROLL, 0, 4)]], apply_tc=True, atol=3e-2, rtol=1e-3)[-1]
|
||||
for u in get_program(k.get_optimized_ast(), k.opts).uops:
|
||||
for u in get_program(k.ast, k.opts, k.applied_opts).uops:
|
||||
if u.op is Ops.WMMA:
|
||||
assert u.src[-1].src[0].op != Ops.STORE
|
||||
|
||||
@@ -429,7 +429,7 @@ class TestLinearizer(unittest.TestCase):
|
||||
x, y = Tensor.rand(128, 128, dtype=tc.dtype_in), Tensor.rand(128, 128, dtype=tc.dtype_in)
|
||||
r = x.matmul(y, dtype=tc.dtype_out)
|
||||
k = helper_linearizer_opt(r, [[Opt(OptOps.UNROLL, 0, 4)]], apply_tc=True, atol=3e-2, rtol=1e-3)[-1]
|
||||
for u in get_program(k.get_optimized_ast(), k.opts).uops:
|
||||
for u in get_program(k.ast, k.opts, k.applied_opts).uops:
|
||||
if u.op is Ops.WMMA:
|
||||
#assert u.src[-1].dtype == dtypes.float.vec(prod(tc.thread_local_sizes[2]))
|
||||
assert u.src[-1].src[0].op != Ops.STORE
|
||||
@@ -442,7 +442,7 @@ class TestLinearizer(unittest.TestCase):
|
||||
x, y = Tensor.rand(128, 128, dtype=tc.dtype_in), Tensor.rand(128, 128, dtype=tc.dtype_in)
|
||||
r = x.matmul(y, dtype=tc.dtype_out).relu()
|
||||
k = helper_linearizer_opt(r, [[Opt(OptOps.UNROLL, 0, 4)]], apply_tc=True, atol=3e-2, rtol=1e-3)[-1]
|
||||
for u in get_program(k.get_optimized_ast(), k.opts).uops:
|
||||
for u in get_program(k.ast, k.opts, k.applied_opts).uops:
|
||||
if u.op is Ops.WMMA:
|
||||
#assert u.src[-1].dtype == dtypes.float.vec(prod(tc.thread_local_sizes[2]))
|
||||
assert u.src[-1].src[0].op != Ops.STORE
|
||||
@@ -453,7 +453,7 @@ class TestLinearizer(unittest.TestCase):
|
||||
r = (x@y).relu()
|
||||
k = helper_linearizer_opt(r, [[Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.UPCAST, 0, 4)]])[-1]
|
||||
# the uops graph is DEFINE_REG -> 4x STORE 0.0 -> RANGE -> 4x ALU -> 4x STORE -> ENDRANGE
|
||||
uops = get_program(k.get_optimized_ast(), k.opts).uops
|
||||
uops = get_program(k.ast, k.opts, k.applied_opts).uops
|
||||
begin_range = [i for i, x in enumerate(uops) if x.op is Ops.RANGE][-1]
|
||||
end_range = [i for i, x in enumerate(uops) if x.op is Ops.ENDRANGE][0]
|
||||
for i,u in enumerate(uops): print(i, u.op, [uops.index(s) for s in u.src], u.arg, u.dtype)
|
||||
@@ -544,7 +544,7 @@ class TestLinearizer(unittest.TestCase):
|
||||
# shrink so that the dims do not collapse
|
||||
t = Tensor.ones(5, 6, 7).contiguous().realize().shrink(((0, 4), (0, 5), (0, 6)))
|
||||
k = helper_linearizer_opt(t+1)[0]
|
||||
uops = get_program(k.get_optimized_ast(), k.opts).uops
|
||||
uops = get_program(k.ast, k.opts, k.applied_opts).uops
|
||||
idxs = dedup([uop for uop in uops if uop.op is Ops.SPECIAL])
|
||||
idxs = sorted(idxs, key=lambda uop: uop.arg[0])
|
||||
assert idxs[0].arg == ('gidx0', 6), idxs[0].arg
|
||||
@@ -584,7 +584,7 @@ class TestLinearizer(unittest.TestCase):
|
||||
def test_phi_simplification(self):
|
||||
def helper(t, max_ops=0):
|
||||
k = helper_linearizer_opt(t)[-1]
|
||||
uops = get_program(k.get_optimized_ast(), k.opts).uops
|
||||
uops = get_program(k.ast, k.opts, k.applied_opts).uops
|
||||
# ignore kernel optimized IF statements for now
|
||||
if if_op:=next((u for u in uops if u.op is Ops.IF), None):
|
||||
uops = uops[:uops.index(if_op)]
|
||||
@@ -616,7 +616,7 @@ class TestLinearizer(unittest.TestCase):
|
||||
x, y = Tensor.randn(64,64), Tensor.randn(64,64)
|
||||
out = x.matmul(y)
|
||||
k = helper_linearizer_opt(out)[-1]
|
||||
uops = get_program(k.get_optimized_ast(), k.opts).uops
|
||||
uops = get_program(k.ast, k.opts, k.applied_opts).uops
|
||||
# check that the float4 cast collapses
|
||||
store_vals = [u.src[1] for u in uops if u.op is Ops.STORE and u.src[0].dtype.addrspace != AddrSpace.REG]
|
||||
for val in store_vals:
|
||||
@@ -641,7 +641,7 @@ class TestLinearizer(unittest.TestCase):
|
||||
x = Tensor.randn((4,3,6,6)).realize()
|
||||
out = x.flip((0,1)).contiguous()
|
||||
k = helper_linearizer_opt(out)[-1]
|
||||
store_val = [u.src[1] for u in get_program(k.get_optimized_ast(), k.opts).uops if u.op is Ops.STORE][0]
|
||||
store_val = [u.src[1] for u in get_program(k.ast, k.opts, k.applied_opts).uops if u.op is Ops.STORE][0]
|
||||
assert store_val.dtype == dtypes.float.vec(4) and store_val.op is not Ops.VECTORIZE
|
||||
|
||||
@unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals")
|
||||
@@ -654,7 +654,7 @@ class TestLinearizer(unittest.TestCase):
|
||||
Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 2)] # upcast accs in both reduces
|
||||
k = helper_linearizer_opt(out, opts=[opt])[-1]
|
||||
def get_recursive(uop): return set.union(set(uop.src), [uop], *[get_recursive(v) for v in uop.src])
|
||||
uops = get_program(k.get_optimized_ast(), k.opts).uops
|
||||
uops = get_program(k.ast, k.opts, k.applied_opts).uops
|
||||
local_stores = [u for u in uops if u.op is Ops.STORE and any(x.op is Ops.DEFINE_LOCAL for x in get_recursive(u.src[0]))]
|
||||
global_stores = [u for u in uops if u.op is Ops.STORE and any(x.op is Ops.DEFINE_GLOBAL for x in get_recursive(u.src[0]))]
|
||||
barrier = [u for u in uops if u.op is Ops.BARRIER][0]
|
||||
@@ -674,7 +674,7 @@ class TestLinearizer(unittest.TestCase):
|
||||
x, y = Tensor.rand(1,128), Tensor.rand(128, 128)
|
||||
r = (x@y).relu()
|
||||
k = helper_linearizer_opt(r)[-1]
|
||||
uops = get_program(k.get_optimized_ast(), k.opts).uops
|
||||
uops = get_program(k.ast, k.opts, k.applied_opts).uops
|
||||
stores = [u for u in uops if u.op is Ops.STORE and u.src[0].dtype.addrspace != AddrSpace.REG]
|
||||
|
||||
# the float4 value stores directly in lds and we skip upcast
|
||||
@@ -700,7 +700,7 @@ class TestLinearizer(unittest.TestCase):
|
||||
Opt(op=OptOps.LOCAL, axis=1, arg=2), Opt(op=OptOps.UPCAST, axis=3, arg=2)
|
||||
]
|
||||
k = helper_linearizer_ast(ast, [Tensor.randn(240*40).realize()], opts=[opt])[-1]
|
||||
out = [u for u in get_program(k.get_optimized_ast(), k.opts).uops if u.op is Ops.STORE][0]
|
||||
out = [u for u in get_program(k.ast, k.opts, k.applied_opts).uops if u.op is Ops.STORE][0]
|
||||
assert out.src[1].op is Ops.VECTORIZE and out.src[1].dtype == dtypes.float.vec(4)
|
||||
|
||||
@unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals")
|
||||
@@ -718,7 +718,7 @@ class TestLinearizer(unittest.TestCase):
|
||||
Opt(op=OptOps.UPCAST, axis=1, arg=0), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=8),
|
||||
Opt(op=OptOps.UPCAST, axis=1, arg=0), Opt(op=OptOps.UPCAST, axis=0, arg=2)]
|
||||
k = helper_linearizer_ast(ast, [Tensor.randn(8*32).realize()], opts=[opt])[-1]
|
||||
out = [u for u in get_program(k.get_optimized_ast(), k.opts).uops if u.op is Ops.STORE][0]
|
||||
out = [u for u in get_program(k.ast, k.opts, k.applied_opts).uops if u.op is Ops.STORE][0]
|
||||
assert out.src[1].op is Ops.VECTORIZE and out.src[1].dtype.count != 1
|
||||
|
||||
@unittest.skipUnless(Device[Device.DEFAULT].renderer.supports_float4, "need backends that support float4")
|
||||
@@ -1049,7 +1049,7 @@ def _helper_linearizer_opt_ast(realized_ast:UOp, real_bufs:list[Buffer], opts=[]
|
||||
outbufs = [real_bufs[x.src[0].base.arg] for x in realized_ast.src]
|
||||
device = real_bufs[0].device
|
||||
|
||||
def get_prg(k:Kernel): return CompiledRunner(replace(get_program(k.get_optimized_ast(), k.opts), device=device))
|
||||
def get_prg(k:Kernel): return CompiledRunner(replace(get_program(k.ast, k.opts, k.applied_opts), device=device))
|
||||
|
||||
def check_opt(opts, create_k, expected_color_size):
|
||||
k = create_k()
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
from typing import cast, Callable
|
||||
import itertools, functools, random, math, time, multiprocessing, traceback, signal, atexit
|
||||
from typing import cast
|
||||
import functools, math, time, multiprocessing, traceback, signal, atexit
|
||||
from collections import defaultdict
|
||||
from dataclasses import replace
|
||||
from tinygrad.uop.ops import UOp, Ops, Variable, sym_infer, AxisType
|
||||
@@ -201,15 +201,3 @@ def beam_search(lin:Kernel, rawbufs:list[Buffer], amt:int, allow_test_size=True,
|
||||
if CACHELEVEL >= 1: diskcache_put("beam_search", key, beam[0][0].applied_opts)
|
||||
if BEAM_DEBUG: print(f"BEAM_SEARCH: final tm={time_to_str(beam[0][1], w=0)}, applied_opts={beam[0][0].applied_opts}")
|
||||
return beam[0][0]
|
||||
|
||||
def optimize_local_size(_prg:Callable, global_size:list[int], rawbufs:list[Buffer]) -> list[int]:
|
||||
test_rawbuffers = [Buffer(rawbufs[0].device, rawbufs[0].size, rawbufs[0].dtype).allocate(), *rawbufs[1:]] if rawbufs[0] in rawbufs[1:] else rawbufs
|
||||
MAX_WORKGROUP = 1024
|
||||
local_dims = [[x for x in set([sz, 1, 2, 4, 8, 16, 32, 64, 128, 256, MAX_WORKGROUP]) if x<=sz] for sz in global_size]
|
||||
local_sizes = [list(x) for x in itertools.product(*local_dims) if prod(x) <= MAX_WORKGROUP] * 2 # try each valid size twice
|
||||
def try_exec(local_size):
|
||||
try: return _prg(*[x._buf for x in test_rawbuffers], global_size=[g//l if g%l == 0 else g/l for g,l in zip(global_size, local_size)], local_size=local_size, wait=True) # noqa: E501
|
||||
except Exception: return float('inf')
|
||||
ret = min([(try_exec(local_size), local_size) for local_size in random.sample(local_sizes, len(local_sizes))])
|
||||
assert not math.isinf(ret[0]), "all optimize_local_size exec failed"
|
||||
return ret[1]
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
from typing import cast, Generator
|
||||
import time, pprint, decimal
|
||||
from typing import cast, Generator, Callable
|
||||
import time, pprint, decimal, random, itertools, math
|
||||
from dataclasses import dataclass, replace, field
|
||||
from tinygrad.helpers import all_same, colored, DEBUG, GlobalCounters, ansilen, BEAM, NOOPT, all_int, CAPTURING, Metadata, TRACEMETA, TracingKey
|
||||
from tinygrad.helpers import DEVECTORIZE, time_to_str, VALIDATE_WITH_CPU, getenv, cpu_profile, PROFILE, ProfilePointEvent, cpu_events
|
||||
from tinygrad.helpers import DEVECTORIZE, time_to_str, VALIDATE_WITH_CPU, getenv, cpu_profile, PROFILE, ProfilePointEvent, cpu_events, prod
|
||||
from tinygrad.uop.ops import Ops, PatternMatcher, UOp, UPat, Variable, sym_infer, graph_rewrite, print_uops, track_rewrites, KernelInfo
|
||||
from tinygrad.device import Device, Buffer
|
||||
from tinygrad.renderer import Renderer, ProgramSpec, Estimates
|
||||
@@ -59,6 +59,20 @@ class Runner:
|
||||
def __call__(self, rawbufs:list[Buffer], var_vals:dict[Variable, int], wait=False) -> float|None:
|
||||
raise NotImplementedError("override this")
|
||||
|
||||
def optimize_local_size(_prg:Callable, global_size:list[int], rawbufs:list[Buffer]) -> list[int]:
|
||||
test_rawbuffers = [Buffer(rawbufs[0].device, rawbufs[0].size, rawbufs[0].dtype).allocate(), *rawbufs[1:]] if rawbufs[0] in rawbufs[1:] else rawbufs
|
||||
MAX_WORKGROUP = 1024
|
||||
local_dims = [[x for x in set([sz, 1, 2, 4, 8, 16, 32, 64, 128, 256, MAX_WORKGROUP]) if x<=sz] for sz in global_size]
|
||||
local_sizes = [list(x) for x in itertools.product(*local_dims) if prod(x) <= MAX_WORKGROUP] * 2 # try each valid size twice
|
||||
def try_exec(local_size):
|
||||
try:
|
||||
return _prg(*[x._buf for x in test_rawbuffers],global_size=[g//l if g%l == 0 else g/l for g,l in zip(global_size, local_size)],
|
||||
local_size=local_size, wait=True)
|
||||
except Exception: return float('inf')
|
||||
ret = min([(try_exec(local_size), local_size) for local_size in random.sample(local_sizes, len(local_sizes))])
|
||||
assert not math.isinf(ret[0]), "all optimize_local_size exec failed"
|
||||
return ret[1]
|
||||
|
||||
class CompiledRunner(Runner):
|
||||
def __init__(self, p:ProgramSpec, precompiled:bytes|None=None, prg=None):
|
||||
if DEBUG >= 4: print(p.src)
|
||||
@@ -76,8 +90,6 @@ class CompiledRunner(Runner):
|
||||
def __call__(self, rawbufs:list[Buffer], var_vals:dict[Variable, int], wait=False) -> float|None:
|
||||
global_size, local_size = self.p.launch_dims(var_vals)
|
||||
if global_size is not None and local_size is None and all_int(self.p.global_size): # type: ignore[arg-type]
|
||||
# TODO: this is copied from get_program
|
||||
from tinygrad.codegen.opt.search import optimize_local_size
|
||||
local_size = optimize_local_size(self._prg, global_size, rawbufs)
|
||||
global_size = [g//l if g%l == 0 else g/l for g,l in zip(global_size, local_size)]
|
||||
self.p = replace(self.p, global_size=global_size, local_size=local_size)
|
||||
|
||||
Reference in New Issue
Block a user