use_tensor_cores is a heuristic (#11989)

* use_tensor_cores is a heuristic
* context
2026-06-13 00:15:35 +08:00 · 2025-09-03 17:05:10 -07:00
parent 63e930fec3
commit a5f2b4872a
5 changed files with 42 additions and 81 deletions
--- a/test/external/speed_beam_v_hcopt.py
+++ b/test/external/speed_beam_v_hcopt.py
@@ -1,41 +0,0 @@
-from tinygrad import Device
-from tinygrad.helpers import getenv, DEBUG, BEAM
-from tinygrad.codegen.opt.search import beam_search, bufs_from_lin
-from tinygrad.codegen.opt.heuristic import hand_coded_optimizations
-from extra.optimization.helpers import load_worlds, ast_str_to_lin, time_linearizer
-
-if __name__ == "__main__":
-  filter_reduce = bool(getenv("FILTER_REDUCE"))
-  ast_strs = load_worlds(filter_reduce=filter_reduce, filter_novariable=True)
-  dev = Device[Device.DEFAULT]
-
-  test_n = getenv("TEST_N", 10)
-  single = getenv("NUM", -1)
-  if single != -1: ast_strs = ast_strs[single:single+1]
-
-  beam_won, tested = 0, 0
-
-  for num, ast in enumerate(ast_strs[:test_n]):
-    def new_lin(): return ast_str_to_lin(ast, opts=dev.renderer)
-
-    k = new_lin()
-
-    if not (used_tensor_cores:=k.apply_tensor_cores(getenv("TC", 1))): k.apply_opts(hand_coded_optimizations(k))
-
-    assert BEAM > 0
-
-    lins = [(("tc" if used_tensor_cores else "hc"), k)]
-    if used_tensor_cores:
-      lins.append(("hc", new_lin()))
-      lins[-1][1].apply_opts(hand_coded_optimizations(lins[-1][1]))
-    kb = new_lin()
-    test_rawbuffers = bufs_from_lin(kb)    # allocate scratch buffers for optimization
-    lins.append((f"beam{BEAM.value}", beam_search(kb, test_rawbuffers, BEAM.value, bool(getenv("BEAM_ESTIMATE", 1)))))
-    timed = sorted([(nm, tk, time_linearizer(tk, test_rawbuffers, allow_test_size=False, clear_l2=True)) for nm, tk in lins], key=lambda x: x[2])
-    if DEBUG >= 1: print("  <  ".join(f"{nm:6s} : {lin.colored_shape(30, dense=True)} : {tm*1e6:8.2f} us" for nm, lin, tm in timed))
-
-    tested += 1
-    if timed[0][0].startswith("beam"):
-      beam_won += 1
-
-  print(f"{beam_won=} / {tested=} = {beam_won/tested:.3f}")
--- a/test/test_linearizer.py
+++ b/test/test_linearizer.py
@@ -616,7 +616,8 @@ class TestLinearizer(unittest.TestCase):
    """
    x, y = Tensor.randn(64,64), Tensor.randn(64,64)
    out = x.matmul(y)
-    k = helper_linearizer_opt(out)[-1]
+    with Context(TC=0):
+      k = helper_linearizer_opt(out)[-1]
    uops = get_program(k.ast, k.opts, k.applied_opts).uops
    # check that the float4 cast collapses
    store_vals = [u.src[1] for u in uops if u.op is Ops.STORE and u.src[0].dtype.addrspace != AddrSpace.REG]
--- a/tinygrad/codegen/opt/init.py
+++ b/tinygrad/codegen/opt/init.py
@@ -3,7 +3,7 @@
 from tinygrad.codegen.opt.kernel import Kernel
 from tinygrad.codegen.opt.heuristic import hand_coded_optimizations
 from tinygrad.uop.ops import UOp, PatternMatcher, UPat, Ops, KernelInfo
-from tinygrad.helpers import NOOPT, BEAM, USE_TC, getenv
+from tinygrad.helpers import NOOPT, BEAM, getenv
 from tinygrad.renderer import Renderer
 from tinygrad.uop.spec import type_verify

@@ -25,7 +25,7 @@ def get_optimized_ast(ast:UOp, renderer:Renderer) -> UOp|None:
  if new_arg is None:
    k = Kernel(ast, opts=renderer)
    if not NOOPT:
-      if not k.apply_tensor_cores(USE_TC.value): k.apply_opts(hand_coded_optimizations(k))
+      k.apply_opts(hand_coded_optimizations(k))
      if BEAM >= 1:
        from tinygrad.codegen.opt.search import beam_search, bufs_from_lin
        kb = Kernel(ast, opts=renderer)
--- a/tinygrad/codegen/opt/heuristic.py
+++ b/tinygrad/codegen/opt/heuristic.py
@@ -1,10 +1,46 @@
 import itertools
 from tinygrad.codegen.opt.kernel import Kernel, Opt, OptOps, KernelOptError, AxisType
-from tinygrad.helpers import getenv, DEBUG, prod, NOLOCALS
+from tinygrad.helpers import getenv, DEBUG, prod, NOLOCALS, TC_OPT, TC_SELECT, USE_TC, AMX
 from tinygrad.dtype import ImageDType
 from tinygrad.uop.ops import Ops, resolve

 def hand_coded_optimizations(k:Kernel) -> list[Opt]:
+  # first try the tensor cores
+  """ Attempts to apply a tensor core optimization to a copy of the kernel. If one applies properly, return the applied opts, otherwise fall through.
+  Tensor cores are optimized instructions that matrix multiply-accumulate across a wave of threads: D(M, N) = A(M, K) * B(K, N) + C(M, N).
+
+  Keyword arguments:
+  use_tensor_cores -- controls how tensor cores are applied (default 1)
+    0: will disable any tensor core matching
+    1: enable tensor cores
+    2: apply tensor core shape but don't use UOp.WMMA
+  extra_opts -- additional Opt's to apply after the tensor core instead of the hand-coded additional Opt's (default None)
+  tc_select -- specifies which tensor core(s) to use for optimization (default -1)
+    -1: iterates through all available tensor cores in order and uses the first one that matches the requirements (dims and dtypes)
+    [0-N]: uses only the n'th tensor core available; useful for search
+  tc_opt -- controls which kinds of kernels may be eligible for tensor cores application (default 2 during BEAM, 0 otherwise)
+    0: applies to only kernels with a single reduce axis and direct Ops.LOAD into Ops.MUL
+    1: allows kernels with multiple reduce axes and also multiplication of Ops.CAST'd buffers
+    2: allows kernels with M, N, K axes that are not multiples of the tensor core dimensions by padding those axes as needed
+  """
+  if USE_TC > 0:
+    try: # check TC first and apply hand-coded opts if successful
+      tk = k.copy()
+      tk.apply_opt(Opt(OptOps.TC, 0, (TC_SELECT.value, TC_OPT.value, USE_TC.value)))
+
+      # skip hand-coded TC opts if AMX, upcasting will make kernel slower
+      if (tc_opts:=tk.tensor_core_opts) is not None and not AMX:
+        # hand-coded TC opts
+        for tc_dim in [tc_dim for tc_dim in [1,0] if tc_opts.axes_exist[tc_dim]]: # attempt to upcast M and N
+          szs = [sz for sz in [5,4,3,2] if tk.full_shape[tc_opts.axes[tc_dim]] % sz == 0]
+          if szs: tk.apply_opt(Opt(OptOps.UPCAST, tc_opts.axes[tc_dim], szs[0]))
+
+        if tc_opts.axes_exist[0] and (szs := [sz for sz in [4,2] if tk.full_shape[tc_opts.axes[0]] % sz == 0]): # attempt to local N
+          tk.apply_opt(Opt(OptOps.LOCAL, tc_opts.axes[0], szs[0]))
+      return tk.applied_opts
+    except KernelOptError:
+      pass
+
  # make a copy so it does not mutate the input
  k = k.copy()

--- a/tinygrad/codegen/opt/kernel.py
+++ b/tinygrad/codegen/opt/kernel.py
@@ -11,7 +11,7 @@ from tinygrad.device import Device
 from tinygrad.codegen.opt.tc import TensorCore
 from tinygrad.renderer import Renderer
 from tinygrad.dtype import ImageDType
-from tinygrad.helpers import all_same, colored, ansilen, dedup, prod, round_up, to_function_name, unwrap, argfix, DEBUG, TC_SELECT, TC_OPT, AMX
+from tinygrad.helpers import all_same, colored, ansilen, dedup, prod, round_up, to_function_name, unwrap, argfix, DEBUG
 from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.view import strides_for_shape, get_contraction
 from tinygrad.codegen.opt.swizzler import view_left, view_left_through_load
@@ -399,41 +399,6 @@ class Kernel:
        return True
    return False

-  def apply_tensor_cores(self, use_tensor_cores=1) -> bool: # , extra_opts:list[Opt]|None=None) -> bool:
-    """ Attempts to apply a tensor core optimization to the kernel. If one exists and applies properly, return true, otherwise return false.
-    Tensor cores are optimized instructions that matrix multiply-accumulate across a wave of threads: D(M, N) = A(M, K) * B(K, N) + C(M, N).
-
-    Keyword arguments:
-    use_tensor_cores -- controls how tensor cores are applied (default 1)
-      0: will disable any tensor core matching
-      1: enable tensor cores
-      2: apply tensor core shape but don't use UOp.WMMA
-    extra_opts -- additional Opt's to apply after the tensor core instead of the hand-coded additional Opt's (default None)
-    tc_select -- specifies which tensor core(s) to use for optimization (default -1)
-      -1: iterates through all available tensor cores in order and uses the first one that matches the requirements (dims and dtypes)
-      [0-N]: uses only the n'th tensor core available; useful for search
-    tc_opt -- controls which kinds of kernels may be eligible for tensor cores application (default 2 during BEAM, 0 otherwise)
-      0: applies to only kernels with a single reduce axis and direct Ops.LOAD into Ops.MUL
-      1: allows kernels with multiple reduce axes and also multiplication of Ops.CAST'd buffers
-      2: allows kernels with M, N, K axes that are not multiples of the tensor core dimensions by applying padding those axes as needed
-    """
-    if not self.opts.tensor_cores: return False
-    try: # check TC first and apply hand-coded opts if successful
-      self.apply_opt(Opt(OptOps.TC, 0, (TC_SELECT.value, TC_OPT.value, use_tensor_cores)))
-
-      if (tc_opts:=self.tensor_core_opts) is not None:
-        if AMX: return True # skip hand-coded TC opts if AMX, upcasting will make kernel slower
-        # hand-coded TC opts
-        for tc_dim in [tc_dim for tc_dim in [1,0] if tc_opts.axes_exist[tc_dim]]: # attempt to upcast M and N
-          szs = [sz for sz in [5,4,3,2] if self.full_shape[tc_opts.axes[tc_dim]] % sz == 0]
-          if szs: self.apply_opt(Opt(OptOps.UPCAST, tc_opts.axes[tc_dim], szs[0]))
-
-        if tc_opts.axes_exist[0] and (szs := [sz for sz in [4,2] if self.full_shape[tc_opts.axes[0]] % sz == 0]): # attempt to local N
-          self.apply_opt(Opt(OptOps.LOCAL, tc_opts.axes[0], szs[0]))
-      return True
-    except KernelOptError:
-      return False
-
  # strings like ['g0', 'g1', 'l0', 'l1', 'l2', 'l3', 'l4', 'l5', 'R0', 'r0', 'r1', 'r2', 'u0', 'u1', 'u2']
  def shape_str(self) -> list[str]:
    ret: list[str] = []