update Kernel API in tests + move optimize_local_size (#11907)

This commit is contained in:
George Hotz
2025-08-28 15:12:47 -07:00
committed by GitHub
parent fa695ac1ce
commit 394c2d1db1
8 changed files with 37 additions and 37 deletions

View File

@@ -56,7 +56,7 @@ def randoms():
# Wraps `ast` in a Kernel, applies `opts` to it, builds a program with get_program and returns a
# CUDAProgram created from the program's function name and the output of compiler.compile(p.src).
# `device` is not a parameter or a local here, so it has to come from a scope outside this function.
# NOTE(review): this is a rendered diff hunk without +/- markers. The two `p = ...` lines below look like
# the pre-change and post-change forms of a single statement (the second rebinds `p`) - confirm against
# the commit before treating both as live code.
def ast_to_cuda_prog(compiler, ast, opts):
k = Kernel(ast)
k.apply_opts(opts)
p = get_program(k.get_optimized_ast(), k.opts)
p = get_program(k.ast, k.opts, k.applied_opts)
return CUDAProgram(device, p.function_name, compiler.compile(p.src))
if __name__ == "__main__":

View File

@@ -29,7 +29,7 @@ if __name__ == "__main__":
Opt(op=OptOps.LOCAL, axis=0, amt=2),
]
k.apply_opts(opts)
prg = get_program(k.get_optimized_ast(), k.opts)
prg = get_program(k.ast, k.opts, k.applied_opts)
new_src = prg.src
# can mod source here
prg = replace(prg, src=new_src)

View File

@@ -58,7 +58,7 @@ if __name__ == "__main__":
GlobalCounters.kernel_count -= 1
if not getenv("NOOPT"): k.apply_opts(hand_coded_optimizations(k))
p2 = get_program(k.get_optimized_ast(), k.opts)
p2 = get_program(k.ast, k.opts, k.applied_opts)
new_ei = replace(ei, prg=CompiledRunner(p2))
new_ei.run()
new_jit.append(new_ei)

View File

@@ -24,5 +24,5 @@ if __name__ == "__main__":
#k.apply_opt(Opt(OptOps.GROUP, 1, 32))
#k.apply_opt(Opt(OptOps.GROUP, 0, 32))
from tinygrad.engine.realize import CompiledRunner, ExecItem
run = CompiledRunner(prg:=get_program(k.get_optimized_ast(), k.opts))
run = CompiledRunner(prg:=get_program(k.ast, k.opts, k.applied_opts))
ExecItem(run, si.bufs).run()

View File

@@ -35,7 +35,7 @@ k = Kernel(ast)
k.apply_opts(opts)
bufs = bufs_from_lin(k)
prg = CompiledRunner(get_program(k.get_optimized_ast(), k.opts))
prg = CompiledRunner(get_program(k.ast, k.opts, k.applied_opts))
for i in range(10):
speed = prg(bufs, var_vals={}, wait=True)

View File

@@ -418,7 +418,7 @@ class TestLinearizer(unittest.TestCase):
x, y = Tensor.rand(128, 128, dtype=tc.dtype_in), Tensor.rand(128, 128, dtype=tc.dtype_in)
r = x.matmul(y, dtype=tc.dtype_out)
k = helper_linearizer_opt(r, [[Opt(OptOps.UNROLL, 0, 4)]], apply_tc=True, atol=3e-2, rtol=1e-3)[-1]
for u in get_program(k.get_optimized_ast(), k.opts).uops:
for u in get_program(k.ast, k.opts, k.applied_opts).uops:
if u.op is Ops.WMMA:
assert u.src[-1].src[0].op != Ops.STORE
@@ -429,7 +429,7 @@ class TestLinearizer(unittest.TestCase):
x, y = Tensor.rand(128, 128, dtype=tc.dtype_in), Tensor.rand(128, 128, dtype=tc.dtype_in)
r = x.matmul(y, dtype=tc.dtype_out)
k = helper_linearizer_opt(r, [[Opt(OptOps.UNROLL, 0, 4)]], apply_tc=True, atol=3e-2, rtol=1e-3)[-1]
for u in get_program(k.get_optimized_ast(), k.opts).uops:
for u in get_program(k.ast, k.opts, k.applied_opts).uops:
if u.op is Ops.WMMA:
#assert u.src[-1].dtype == dtypes.float.vec(prod(tc.thread_local_sizes[2]))
assert u.src[-1].src[0].op != Ops.STORE
@@ -442,7 +442,7 @@ class TestLinearizer(unittest.TestCase):
x, y = Tensor.rand(128, 128, dtype=tc.dtype_in), Tensor.rand(128, 128, dtype=tc.dtype_in)
r = x.matmul(y, dtype=tc.dtype_out).relu()
k = helper_linearizer_opt(r, [[Opt(OptOps.UNROLL, 0, 4)]], apply_tc=True, atol=3e-2, rtol=1e-3)[-1]
for u in get_program(k.get_optimized_ast(), k.opts).uops:
for u in get_program(k.ast, k.opts, k.applied_opts).uops:
if u.op is Ops.WMMA:
#assert u.src[-1].dtype == dtypes.float.vec(prod(tc.thread_local_sizes[2]))
assert u.src[-1].src[0].op != Ops.STORE
@@ -453,7 +453,7 @@ class TestLinearizer(unittest.TestCase):
r = (x@y).relu()
k = helper_linearizer_opt(r, [[Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.UPCAST, 0, 4)]])[-1]
# the uops graph is DEFINE_REG -> 4x STORE 0.0 -> RANGE -> 4x ALU -> 4x STORE -> ENDRANGE
uops = get_program(k.get_optimized_ast(), k.opts).uops
uops = get_program(k.ast, k.opts, k.applied_opts).uops
begin_range = [i for i, x in enumerate(uops) if x.op is Ops.RANGE][-1]
end_range = [i for i, x in enumerate(uops) if x.op is Ops.ENDRANGE][0]
for i,u in enumerate(uops): print(i, u.op, [uops.index(s) for s in u.src], u.arg, u.dtype)
@@ -544,7 +544,7 @@ class TestLinearizer(unittest.TestCase):
# shrink so that the dims do not collapse
t = Tensor.ones(5, 6, 7).contiguous().realize().shrink(((0, 4), (0, 5), (0, 6)))
k = helper_linearizer_opt(t+1)[0]
uops = get_program(k.get_optimized_ast(), k.opts).uops
uops = get_program(k.ast, k.opts, k.applied_opts).uops
idxs = dedup([uop for uop in uops if uop.op is Ops.SPECIAL])
idxs = sorted(idxs, key=lambda uop: uop.arg[0])
assert idxs[0].arg == ('gidx0', 6), idxs[0].arg
@@ -584,7 +584,7 @@ class TestLinearizer(unittest.TestCase):
def test_phi_simplification(self):
def helper(t, max_ops=0):
k = helper_linearizer_opt(t)[-1]
uops = get_program(k.get_optimized_ast(), k.opts).uops
uops = get_program(k.ast, k.opts, k.applied_opts).uops
# ignore kernel optimized IF statements for now
if if_op:=next((u for u in uops if u.op is Ops.IF), None):
uops = uops[:uops.index(if_op)]
@@ -616,7 +616,7 @@ class TestLinearizer(unittest.TestCase):
x, y = Tensor.randn(64,64), Tensor.randn(64,64)
out = x.matmul(y)
k = helper_linearizer_opt(out)[-1]
uops = get_program(k.get_optimized_ast(), k.opts).uops
uops = get_program(k.ast, k.opts, k.applied_opts).uops
# check that the float4 cast collapses
store_vals = [u.src[1] for u in uops if u.op is Ops.STORE and u.src[0].dtype.addrspace != AddrSpace.REG]
for val in store_vals:
@@ -641,7 +641,7 @@ class TestLinearizer(unittest.TestCase):
x = Tensor.randn((4,3,6,6)).realize()
out = x.flip((0,1)).contiguous()
k = helper_linearizer_opt(out)[-1]
store_val = [u.src[1] for u in get_program(k.get_optimized_ast(), k.opts).uops if u.op is Ops.STORE][0]
store_val = [u.src[1] for u in get_program(k.ast, k.opts, k.applied_opts).uops if u.op is Ops.STORE][0]
assert store_val.dtype == dtypes.float.vec(4) and store_val.op is not Ops.VECTORIZE
@unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals")
@@ -654,7 +654,7 @@ class TestLinearizer(unittest.TestCase):
Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 2)] # upcast accs in both reduces
k = helper_linearizer_opt(out, opts=[opt])[-1]
def get_recursive(uop): return set.union(set(uop.src), [uop], *[get_recursive(v) for v in uop.src])
uops = get_program(k.get_optimized_ast(), k.opts).uops
uops = get_program(k.ast, k.opts, k.applied_opts).uops
local_stores = [u for u in uops if u.op is Ops.STORE and any(x.op is Ops.DEFINE_LOCAL for x in get_recursive(u.src[0]))]
global_stores = [u for u in uops if u.op is Ops.STORE and any(x.op is Ops.DEFINE_GLOBAL for x in get_recursive(u.src[0]))]
barrier = [u for u in uops if u.op is Ops.BARRIER][0]
@@ -674,7 +674,7 @@ class TestLinearizer(unittest.TestCase):
x, y = Tensor.rand(1,128), Tensor.rand(128, 128)
r = (x@y).relu()
k = helper_linearizer_opt(r)[-1]
uops = get_program(k.get_optimized_ast(), k.opts).uops
uops = get_program(k.ast, k.opts, k.applied_opts).uops
stores = [u for u in uops if u.op is Ops.STORE and u.src[0].dtype.addrspace != AddrSpace.REG]
# the float4 value stores directly in lds and we skip upcast
@@ -700,7 +700,7 @@ class TestLinearizer(unittest.TestCase):
Opt(op=OptOps.LOCAL, axis=1, arg=2), Opt(op=OptOps.UPCAST, axis=3, arg=2)
]
k = helper_linearizer_ast(ast, [Tensor.randn(240*40).realize()], opts=[opt])[-1]
out = [u for u in get_program(k.get_optimized_ast(), k.opts).uops if u.op is Ops.STORE][0]
out = [u for u in get_program(k.ast, k.opts, k.applied_opts).uops if u.op is Ops.STORE][0]
assert out.src[1].op is Ops.VECTORIZE and out.src[1].dtype == dtypes.float.vec(4)
@unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals")
@@ -718,7 +718,7 @@ class TestLinearizer(unittest.TestCase):
Opt(op=OptOps.UPCAST, axis=1, arg=0), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=8),
Opt(op=OptOps.UPCAST, axis=1, arg=0), Opt(op=OptOps.UPCAST, axis=0, arg=2)]
k = helper_linearizer_ast(ast, [Tensor.randn(8*32).realize()], opts=[opt])[-1]
out = [u for u in get_program(k.get_optimized_ast(), k.opts).uops if u.op is Ops.STORE][0]
out = [u for u in get_program(k.ast, k.opts, k.applied_opts).uops if u.op is Ops.STORE][0]
assert out.src[1].op is Ops.VECTORIZE and out.src[1].dtype.count != 1
@unittest.skipUnless(Device[Device.DEFAULT].renderer.supports_float4, "need backends that support float4")
@@ -1049,7 +1049,7 @@ def _helper_linearizer_opt_ast(realized_ast:UOp, real_bufs:list[Buffer], opts=[]
outbufs = [real_bufs[x.src[0].base.arg] for x in realized_ast.src]
device = real_bufs[0].device
def get_prg(k:Kernel): return CompiledRunner(replace(get_program(k.get_optimized_ast(), k.opts), device=device))
def get_prg(k:Kernel): return CompiledRunner(replace(get_program(k.ast, k.opts, k.applied_opts), device=device))
def check_opt(opts, create_k, expected_color_size):
k = create_k()

View File

@@ -1,5 +1,5 @@
from typing import cast, Callable
import itertools, functools, random, math, time, multiprocessing, traceback, signal, atexit
from typing import cast
import functools, math, time, multiprocessing, traceback, signal, atexit
from collections import defaultdict
from dataclasses import replace
from tinygrad.uop.ops import UOp, Ops, Variable, sym_infer, AxisType
@@ -201,15 +201,3 @@ def beam_search(lin:Kernel, rawbufs:list[Buffer], amt:int, allow_test_size=True,
if CACHELEVEL >= 1: diskcache_put("beam_search", key, beam[0][0].applied_opts)
if BEAM_DEBUG: print(f"BEAM_SEARCH: final tm={time_to_str(beam[0][1], w=0)}, applied_opts={beam[0][0].applied_opts}")
return beam[0][0]
def optimize_local_size(_prg:Callable, global_size:list[int], rawbufs:list[Buffer]) -> list[int]:
  """Search for the local size that makes `_prg` return the smallest value for `global_size`.

  Per dimension the candidates are the dimension itself plus 1, 2, 4, ..., 256 and MAX_WORKGROUP, limited to
  values not larger than the dimension. A combination is kept when its product is at most MAX_WORKGROUP.
  Every kept combination is launched twice, in random order, and a launch that raises counts as infinite.

  Args:
    _prg: callable invoked as `_prg(*bufs, global_size=..., local_size=..., wait=True)`; its return value is
      the cost being minimised (presumably the elapsed time - confirm against the runtime).
    global_size: launch size per dimension.
    rawbufs: buffers whose `_buf` handles are passed to `_prg`.

  Returns:
    The local size (one entry per dimension of `global_size`) with the smallest measured value.

  Raises:
    AssertionError: if every launch failed.
  """
  MAX_WORKGROUP = 1024
  first, rest = rawbufs[0], rawbufs[1:]
  # if the first buffer shows up again later in the list, benchmark against a freshly allocated stand-in for it
  if first in rest: test_rawbuffers = [Buffer(first.device, first.size, first.dtype).allocate(), *rest]
  else: test_rawbuffers = rawbufs

  base_candidates = [1, 2, 4, 8, 16, 32, 64, 128, 256, MAX_WORKGROUP]
  local_dims = []
  for sz in global_size:
    # sz is inserted first, exactly as before, so the set is built in the same order
    local_dims.append([x for x in set([sz, *base_candidates]) if x <= sz])

  valid_sizes = [list(combo) for combo in itertools.product(*local_dims) if prod(combo) <= MAX_WORKGROUP]
  local_sizes = valid_sizes * 2 # try each valid size twice

  def try_exec(local_size):
    try:
      handles = [x._buf for x in test_rawbuffers]
      # an uneven split is left as a float from true division
      # NOTE(review): presumably so that the launch gets rejected - confirm
      launch_size = [g//l if g%l == 0 else g/l for g,l in zip(global_size, local_size)]
      return _prg(*handles, global_size=launch_size, local_size=local_size, wait=True)
    except Exception: return float('inf')

  shuffled = random.sample(local_sizes, len(local_sizes))
  best_time, best_local = min([(try_exec(local_size), local_size) for local_size in shuffled])
  assert not math.isinf(best_time), "all optimize_local_size exec failed"
  return best_local

View File

@@ -1,8 +1,8 @@
from typing import cast, Generator
import time, pprint, decimal
from typing import cast, Generator, Callable
import time, pprint, decimal, random, itertools, math
from dataclasses import dataclass, replace, field
from tinygrad.helpers import all_same, colored, DEBUG, GlobalCounters, ansilen, BEAM, NOOPT, all_int, CAPTURING, Metadata, TRACEMETA, TracingKey
from tinygrad.helpers import DEVECTORIZE, time_to_str, VALIDATE_WITH_CPU, getenv, cpu_profile, PROFILE, ProfilePointEvent, cpu_events
from tinygrad.helpers import DEVECTORIZE, time_to_str, VALIDATE_WITH_CPU, getenv, cpu_profile, PROFILE, ProfilePointEvent, cpu_events, prod
from tinygrad.uop.ops import Ops, PatternMatcher, UOp, UPat, Variable, sym_infer, graph_rewrite, print_uops, track_rewrites, KernelInfo
from tinygrad.device import Device, Buffer
from tinygrad.renderer import Renderer, ProgramSpec, Estimates
@@ -59,6 +59,20 @@ class Runner:
# Call hook of the class named in the hunk header above (Runner). This version does nothing but raise
# NotImplementedError, so a subclass has to provide its own __call__; CompiledRunner further down in this
# hunk defines one with the same signature.
# NOTE(review): the float|None result is presumably a run time when wait=True - confirm in a subclass.
def __call__(self, rawbufs:list[Buffer], var_vals:dict[Variable, int], wait=False) -> float|None:
raise NotImplementedError("override this")
def optimize_local_size(_prg:Callable, global_size:list[int], rawbufs:list[Buffer]) -> list[int]:
  """Benchmark `_prg` over candidate local sizes and return the one with the lowest returned value.

  Candidate values for a dimension of size `sz` are `sz`, 1, 2, 4, ..., 256 and MAX_WORKGROUP, keeping only
  those that are <= `sz`. A combination across dimensions is tried only if its product is <= MAX_WORKGROUP.
  The list of combinations is doubled, so each one is run twice, and the runs happen in random order.

  Args:
    _prg: called as `_prg(*bufs, global_size=..., local_size=..., wait=True)`. Whatever it returns is used
      as the score to minimise (presumably a wall time - confirm against the runtime).
    global_size: size of each launch dimension.
    rawbufs: buffers for the launch; their `_buf` attributes are what `_prg` receives.

  Returns:
    The winning local size, as a list with one entry per dimension.

  Raises:
    AssertionError: when no trial produced a finite score.
  """
  # when rawbufs[0] occurs again among the remaining buffers, the trials use a newly allocated buffer in its place
  test_rawbuffers = rawbufs
  if rawbufs[0] in rawbufs[1:]:
    stand_in = Buffer(rawbufs[0].device, rawbufs[0].size, rawbufs[0].dtype).allocate()
    test_rawbuffers = [stand_in, *rawbufs[1:]]

  MAX_WORKGROUP = 1024

  def axis_candidates(sz):
    # local sizes considered for one dimension: never larger than the dimension itself
    return [x for x in set([sz, 1, 2, 4, 8, 16, 32, 64, 128, 256, MAX_WORKGROUP]) if x<=sz]

  local_dims = [axis_candidates(sz) for sz in global_size]
  local_sizes = [list(x) for x in itertools.product(*local_dims) if prod(x) <= MAX_WORKGROUP] * 2 # try each valid size twice

  def try_exec(local_size):
    # any exception raised while preparing or running the launch scores the candidate as infinite
    try:
      bufs = [x._buf for x in test_rawbuffers]
      groups = [g//l if g%l == 0 else g/l for g,l in zip(global_size, local_size)]
      return _prg(*bufs, global_size=groups, local_size=local_size, wait=True)
    except Exception:
      return float('inf')

  results = []
  for local_size in random.sample(local_sizes, len(local_sizes)):
    results.append((try_exec(local_size), local_size))
  ret = min(results)
  assert not math.isinf(ret[0]), "all optimize_local_size exec failed"
  return ret[1]
class CompiledRunner(Runner):
def __init__(self, p:ProgramSpec, precompiled:bytes|None=None, prg=None):
if DEBUG >= 4: print(p.src)
@@ -76,8 +90,6 @@ class CompiledRunner(Runner):
def __call__(self, rawbufs:list[Buffer], var_vals:dict[Variable, int], wait=False) -> float|None:
global_size, local_size = self.p.launch_dims(var_vals)
if global_size is not None and local_size is None and all_int(self.p.global_size): # type: ignore[arg-type]
# TODO: this is copied from get_program
from tinygrad.codegen.opt.search import optimize_local_size
local_size = optimize_local_size(self._prg, global_size, rawbufs)
global_size = [g//l if g%l == 0 else g/l for g,l in zip(global_size, local_size)]
self.p = replace(self.p, global_size=global_size, local_size=local_size)