From 394c2d1db114dcbe80262f8ddb76c3b5947af7bf Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Thu, 28 Aug 2025 15:12:47 -0700 Subject: [PATCH] update Kernel API in tests + move optimize_local_size (#11907) --- extra/gemm/max_matmul.py | 2 +- extra/gemm/tinygrad_nv_matmul.py | 2 +- extra/replay_pkl.py | 2 +- .../external_benchmark_sdxl_softmax.py | 2 +- test/external/external_debug_metal_sd_conv.py | 2 +- test/test_linearizer.py | 26 +++++++++---------- tinygrad/codegen/opt/search.py | 16 ++---------- tinygrad/engine/realize.py | 22 ++++++++++++---- 8 files changed, 37 insertions(+), 37 deletions(-) diff --git a/extra/gemm/max_matmul.py b/extra/gemm/max_matmul.py index c1c376ea83..6937b7d153 100644 --- a/extra/gemm/max_matmul.py +++ b/extra/gemm/max_matmul.py @@ -56,7 +56,7 @@ def randoms(): def ast_to_cuda_prog(compiler, ast, opts): k = Kernel(ast) k.apply_opts(opts) - p = get_program(k.get_optimized_ast(), k.opts) + p = get_program(k.ast, k.opts, k.applied_opts) return CUDAProgram(device, p.function_name, compiler.compile(p.src)) if __name__ == "__main__": diff --git a/extra/gemm/tinygrad_nv_matmul.py b/extra/gemm/tinygrad_nv_matmul.py index 8fff78ea20..1ee3e72e15 100644 --- a/extra/gemm/tinygrad_nv_matmul.py +++ b/extra/gemm/tinygrad_nv_matmul.py @@ -29,7 +29,7 @@ if __name__ == "__main__": Opt(op=OptOps.LOCAL, axis=0, amt=2), ] k.apply_opts(opts) - prg = get_program(k.get_optimized_ast(), k.opts) + prg = get_program(k.ast, k.opts, k.applied_opts) new_src = prg.src # can mod source here prg = replace(prg, src=new_src) diff --git a/extra/replay_pkl.py b/extra/replay_pkl.py index a1456e7125..e4cb5ed543 100644 --- a/extra/replay_pkl.py +++ b/extra/replay_pkl.py @@ -58,7 +58,7 @@ if __name__ == "__main__": GlobalCounters.kernel_count -= 1 if not getenv("NOOPT"): k.apply_opts(hand_coded_optimizations(k)) - p2 = get_program(k.get_optimized_ast(), k.opts) + p2 = get_program(k.ast, k.opts, k.applied_opts) new_ei = replace(ei, prg=CompiledRunner(p2)) new_ei.run() new_jit.append(new_ei) diff --git a/test/external/external_benchmark_sdxl_softmax.py b/test/external/external_benchmark_sdxl_softmax.py index 40ee746810..19837ac9b2 100644 --- a/test/external/external_benchmark_sdxl_softmax.py +++ b/test/external/external_benchmark_sdxl_softmax.py @@ -24,5 +24,5 @@ if __name__ == "__main__": #k.apply_opt(Opt(OptOps.GROUP, 1, 32)) #k.apply_opt(Opt(OptOps.GROUP, 0, 32)) from tinygrad.engine.realize import CompiledRunner, ExecItem - run = CompiledRunner(prg:=get_program(k.get_optimized_ast(), k.opts)) + run = CompiledRunner(prg:=get_program(k.ast, k.opts, k.applied_opts)) ExecItem(run, si.bufs).run() diff --git a/test/external/external_debug_metal_sd_conv.py b/test/external/external_debug_metal_sd_conv.py index afb8ef296b..e13c6a4857 100644 --- a/test/external/external_debug_metal_sd_conv.py +++ b/test/external/external_debug_metal_sd_conv.py @@ -35,7 +35,7 @@ k = Kernel(ast) k.apply_opts(opts) bufs = bufs_from_lin(k) -prg = CompiledRunner(get_program(k.get_optimized_ast(), k.opts)) +prg = CompiledRunner(get_program(k.ast, k.opts, k.applied_opts)) for i in range(10): speed = prg(bufs, var_vals={}, wait=True) diff --git a/test/test_linearizer.py b/test/test_linearizer.py index 9c4198ba6b..f6c3c446bc 100644 --- a/test/test_linearizer.py +++ b/test/test_linearizer.py @@ -418,7 +418,7 @@ class TestLinearizer(unittest.TestCase): x, y = Tensor.rand(128, 128, dtype=tc.dtype_in), Tensor.rand(128, 128, dtype=tc.dtype_in) r = x.matmul(y, dtype=tc.dtype_out) k = helper_linearizer_opt(r, [[Opt(OptOps.UNROLL, 0, 4)]], apply_tc=True, atol=3e-2, rtol=1e-3)[-1] - for u in get_program(k.get_optimized_ast(), k.opts).uops: + for u in get_program(k.ast, k.opts, k.applied_opts).uops: if u.op is Ops.WMMA: assert u.src[-1].src[0].op != Ops.STORE @@ -429,7 +429,7 @@ class TestLinearizer(unittest.TestCase): x, y = Tensor.rand(128, 128, dtype=tc.dtype_in), Tensor.rand(128, 128, dtype=tc.dtype_in) r = x.matmul(y, dtype=tc.dtype_out) k = helper_linearizer_opt(r, [[Opt(OptOps.UNROLL, 0, 4)]], apply_tc=True, atol=3e-2, rtol=1e-3)[-1] - for u in get_program(k.get_optimized_ast(), k.opts).uops: + for u in get_program(k.ast, k.opts, k.applied_opts).uops: if u.op is Ops.WMMA: #assert u.src[-1].dtype == dtypes.float.vec(prod(tc.thread_local_sizes[2])) assert u.src[-1].src[0].op != Ops.STORE @@ -442,7 +442,7 @@ class TestLinearizer(unittest.TestCase): x, y = Tensor.rand(128, 128, dtype=tc.dtype_in), Tensor.rand(128, 128, dtype=tc.dtype_in) r = x.matmul(y, dtype=tc.dtype_out).relu() k = helper_linearizer_opt(r, [[Opt(OptOps.UNROLL, 0, 4)]], apply_tc=True, atol=3e-2, rtol=1e-3)[-1] - for u in get_program(k.get_optimized_ast(), k.opts).uops: + for u in get_program(k.ast, k.opts, k.applied_opts).uops: if u.op is Ops.WMMA: #assert u.src[-1].dtype == dtypes.float.vec(prod(tc.thread_local_sizes[2])) assert u.src[-1].src[0].op != Ops.STORE @@ -453,7 +453,7 @@ class TestLinearizer(unittest.TestCase): r = (x@y).relu() k = helper_linearizer_opt(r, [[Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.UPCAST, 0, 4)]])[-1] # the uops graph is DEFINE_REG -> 4x STORE 0.0 -> RANGE -> 4x ALU -> 4x STORE -> ENDRANGE - uops = get_program(k.get_optimized_ast(), k.opts).uops + uops = get_program(k.ast, k.opts, k.applied_opts).uops begin_range = [i for i, x in enumerate(uops) if x.op is Ops.RANGE][-1] end_range = [i for i, x in enumerate(uops) if x.op is Ops.ENDRANGE][0] for i,u in enumerate(uops): print(i, u.op, [uops.index(s) for s in u.src], u.arg, u.dtype) @@ -544,7 +544,7 @@ class TestLinearizer(unittest.TestCase): # shrink so that the dims do not collapse t = Tensor.ones(5, 6, 7).contiguous().realize().shrink(((0, 4), (0, 5), (0, 6))) k = helper_linearizer_opt(t+1)[0] - uops = get_program(k.get_optimized_ast(), k.opts).uops + uops = get_program(k.ast, k.opts, k.applied_opts).uops idxs = dedup([uop for uop in uops if uop.op is Ops.SPECIAL]) idxs = sorted(idxs, key=lambda uop: uop.arg[0]) assert idxs[0].arg == ('gidx0', 6), idxs[0].arg @@ -584,7 +584,7 @@ class TestLinearizer(unittest.TestCase): def test_phi_simplification(self): def helper(t, max_ops=0): k = helper_linearizer_opt(t)[-1] - uops = get_program(k.get_optimized_ast(), k.opts).uops + uops = get_program(k.ast, k.opts, k.applied_opts).uops # ignore kernel optimized IF statements for now if if_op:=next((u for u in uops if u.op is Ops.IF), None): uops = uops[:uops.index(if_op)] @@ -616,7 +616,7 @@ class TestLinearizer(unittest.TestCase): x, y = Tensor.randn(64,64), Tensor.randn(64,64) out = x.matmul(y) k = helper_linearizer_opt(out)[-1] - uops = get_program(k.get_optimized_ast(), k.opts).uops + uops = get_program(k.ast, k.opts, k.applied_opts).uops # check that the float4 cast collapses store_vals = [u.src[1] for u in uops if u.op is Ops.STORE and u.src[0].dtype.addrspace != AddrSpace.REG] for val in store_vals: @@ -641,7 +641,7 @@ class TestLinearizer(unittest.TestCase): x = Tensor.randn((4,3,6,6)).realize() out = x.flip((0,1)).contiguous() k = helper_linearizer_opt(out)[-1] - store_val = [u.src[1] for u in get_program(k.get_optimized_ast(), k.opts).uops if u.op is Ops.STORE][0] + store_val = [u.src[1] for u in get_program(k.ast, k.opts, k.applied_opts).uops if u.op is Ops.STORE][0] assert store_val.dtype == dtypes.float.vec(4) and store_val.op is not Ops.VECTORIZE @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals") @@ -654,7 +654,7 @@ class TestLinearizer(unittest.TestCase): Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 2)] # upcast accs in both reduces k = helper_linearizer_opt(out, opts=[opt])[-1] def get_recursive(uop): return set.union(set(uop.src), [uop], *[get_recursive(v) for v in uop.src]) - uops = get_program(k.get_optimized_ast(), k.opts).uops + uops = get_program(k.ast, k.opts, k.applied_opts).uops local_stores = [u for u in uops if u.op is Ops.STORE and any(x.op is Ops.DEFINE_LOCAL for x in get_recursive(u.src[0]))] global_stores = [u for u in uops if u.op is Ops.STORE and any(x.op is Ops.DEFINE_GLOBAL for x in get_recursive(u.src[0]))] barrier = [u for u in uops if u.op is Ops.BARRIER][0] @@ -674,7 +674,7 @@ class TestLinearizer(unittest.TestCase): x, y = Tensor.rand(1,128), Tensor.rand(128, 128) r = (x@y).relu() k = helper_linearizer_opt(r)[-1] - uops = get_program(k.get_optimized_ast(), k.opts).uops + uops = get_program(k.ast, k.opts, k.applied_opts).uops stores = [u for u in uops if u.op is Ops.STORE and u.src[0].dtype.addrspace != AddrSpace.REG] # the float4 value stores directly in lds and we skip upcast @@ -700,7 +700,7 @@ class TestLinearizer(unittest.TestCase): Opt(op=OptOps.LOCAL, axis=1, arg=2), Opt(op=OptOps.UPCAST, axis=3, arg=2) ] k = helper_linearizer_ast(ast, [Tensor.randn(240*40).realize()], opts=[opt])[-1] - out = [u for u in get_program(k.get_optimized_ast(), k.opts).uops if u.op is Ops.STORE][0] + out = [u for u in get_program(k.ast, k.opts, k.applied_opts).uops if u.op is Ops.STORE][0] assert out.src[1].op is Ops.VECTORIZE and out.src[1].dtype == dtypes.float.vec(4) @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals") @@ -718,7 +718,7 @@ class TestLinearizer(unittest.TestCase): Opt(op=OptOps.UPCAST, axis=1, arg=0), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=8), Opt(op=OptOps.UPCAST, axis=1, arg=0), Opt(op=OptOps.UPCAST, axis=0, arg=2)] k = helper_linearizer_ast(ast, [Tensor.randn(8*32).realize()], opts=[opt])[-1] - out = [u for u in get_program(k.get_optimized_ast(), k.opts).uops if u.op is Ops.STORE][0] + out = [u for u in get_program(k.ast, k.opts, k.applied_opts).uops if u.op is Ops.STORE][0] assert out.src[1].op is Ops.VECTORIZE and out.src[1].dtype.count != 1 @unittest.skipUnless(Device[Device.DEFAULT].renderer.supports_float4, "need backends that support float4") @@ -1049,7 +1049,7 @@ def _helper_linearizer_opt_ast(realized_ast:UOp, real_bufs:list[Buffer], opts=[] outbufs = [real_bufs[x.src[0].base.arg] for x in realized_ast.src] device = real_bufs[0].device - def get_prg(k:Kernel): return CompiledRunner(replace(get_program(k.get_optimized_ast(), k.opts), device=device)) + def get_prg(k:Kernel): return CompiledRunner(replace(get_program(k.ast, k.opts, k.applied_opts), device=device)) def check_opt(opts, create_k, expected_color_size): k = create_k() diff --git a/tinygrad/codegen/opt/search.py b/tinygrad/codegen/opt/search.py index 8db9c8c9d8..0ae91a43c1 100644 --- a/tinygrad/codegen/opt/search.py +++ b/tinygrad/codegen/opt/search.py @@ -1,5 +1,5 @@ -from typing import cast, Callable -import itertools, functools, random, math, time, multiprocessing, traceback, signal, atexit +from typing import cast +import functools, math, time, multiprocessing, traceback, signal, atexit from collections import defaultdict from dataclasses import replace from tinygrad.uop.ops import UOp, Ops, Variable, sym_infer, AxisType @@ -201,15 +201,3 @@ def beam_search(lin:Kernel, rawbufs:list[Buffer], amt:int, allow_test_size=True, if CACHELEVEL >= 1: diskcache_put("beam_search", key, beam[0][0].applied_opts) if BEAM_DEBUG: print(f"BEAM_SEARCH: final tm={time_to_str(beam[0][1], w=0)}, applied_opts={beam[0][0].applied_opts}") return beam[0][0] - -def optimize_local_size(_prg:Callable, global_size:list[int], rawbufs:list[Buffer]) -> list[int]: - test_rawbuffers = [Buffer(rawbufs[0].device, rawbufs[0].size, rawbufs[0].dtype).allocate(), *rawbufs[1:]] if rawbufs[0] in rawbufs[1:] else rawbufs - MAX_WORKGROUP = 1024 - local_dims = [[x for x in set([sz, 1, 2, 4, 8, 16, 32, 64, 128, 256, MAX_WORKGROUP]) if x<=sz] for sz in global_size] - local_sizes = [list(x) for x in itertools.product(*local_dims) if prod(x) <= MAX_WORKGROUP] * 2 # try each valid size twice - def try_exec(local_size): - try: return _prg(*[x._buf for x in test_rawbuffers], global_size=[g//l if g%l == 0 else g/l for g,l in zip(global_size, local_size)], local_size=local_size, wait=True) # noqa: E501 - except Exception: return float('inf') - ret = min([(try_exec(local_size), local_size) for local_size in random.sample(local_sizes, len(local_sizes))]) - assert not math.isinf(ret[0]), "all optimize_local_size exec failed" - return ret[1] diff --git a/tinygrad/engine/realize.py b/tinygrad/engine/realize.py index f7358cdd6a..9f7e55a56c 100644 --- a/tinygrad/engine/realize.py +++ b/tinygrad/engine/realize.py @@ -1,8 +1,8 @@ -from typing import cast, Generator -import time, pprint, decimal +from typing import cast, Generator, Callable +import time, pprint, decimal, random, itertools, math from dataclasses import dataclass, replace, field from tinygrad.helpers import all_same, colored, DEBUG, GlobalCounters, ansilen, BEAM, NOOPT, all_int, CAPTURING, Metadata, TRACEMETA, TracingKey -from tinygrad.helpers import DEVECTORIZE, time_to_str, VALIDATE_WITH_CPU, getenv, cpu_profile, PROFILE, ProfilePointEvent, cpu_events +from tinygrad.helpers import DEVECTORIZE, time_to_str, VALIDATE_WITH_CPU, getenv, cpu_profile, PROFILE, ProfilePointEvent, cpu_events, prod from tinygrad.uop.ops import Ops, PatternMatcher, UOp, UPat, Variable, sym_infer, graph_rewrite, print_uops, track_rewrites, KernelInfo from tinygrad.device import Device, Buffer from tinygrad.renderer import Renderer, ProgramSpec, Estimates @@ -59,6 +59,20 @@ class Runner: def __call__(self, rawbufs:list[Buffer], var_vals:dict[Variable, int], wait=False) -> float|None: raise NotImplementedError("override this") +def optimize_local_size(_prg:Callable, global_size:list[int], rawbufs:list[Buffer]) -> list[int]: + test_rawbuffers = [Buffer(rawbufs[0].device, rawbufs[0].size, rawbufs[0].dtype).allocate(), *rawbufs[1:]] if rawbufs[0] in rawbufs[1:] else rawbufs + MAX_WORKGROUP = 1024 + local_dims = [[x for x in set([sz, 1, 2, 4, 8, 16, 32, 64, 128, 256, MAX_WORKGROUP]) if x<=sz] for sz in global_size] + local_sizes = [list(x) for x in itertools.product(*local_dims) if prod(x) <= MAX_WORKGROUP] * 2 # try each valid size twice + def try_exec(local_size): + try: + return _prg(*[x._buf for x in test_rawbuffers],global_size=[g//l if g%l == 0 else g/l for g,l in zip(global_size, local_size)], + local_size=local_size, wait=True) + except Exception: return float('inf') + ret = min([(try_exec(local_size), local_size) for local_size in random.sample(local_sizes, len(local_sizes))]) + assert not math.isinf(ret[0]), "all optimize_local_size exec failed" + return ret[1] + class CompiledRunner(Runner): def __init__(self, p:ProgramSpec, precompiled:bytes|None=None, prg=None): if DEBUG >= 4: print(p.src) @@ -76,8 +90,6 @@ class CompiledRunner(Runner): def __call__(self, rawbufs:list[Buffer], var_vals:dict[Variable, int], wait=False) -> float|None: global_size, local_size = self.p.launch_dims(var_vals) if global_size is not None and local_size is None and all_int(self.p.global_size): # type: ignore[arg-type] - # TODO: this is copied from get_program - from tinygrad.codegen.opt.search import optimize_local_size local_size = optimize_local_size(self._prg, global_size, rawbufs) global_size = [g//l if g%l == 0 else g/l for g,l in zip(global_size, local_size)] self.p = replace(self.p, global_size=global_size, local_size=local_size)