use_tensor_cores is a heuristic (#11989)

* use_tensor_cores is a heuristic

* context
This commit is contained in:
George Hotz
2025-09-03 17:05:10 -07:00
committed by GitHub
parent 63e930fec3
commit a5f2b4872a
5 changed files with 42 additions and 81 deletions

View File

@@ -1,41 +0,0 @@
from tinygrad import Device
from tinygrad.helpers import getenv, DEBUG, BEAM
from tinygrad.codegen.opt.search import beam_search, bufs_from_lin
from tinygrad.codegen.opt.heuristic import hand_coded_optimizations
from extra.optimization.helpers import load_worlds, ast_str_to_lin, time_linearizer
# Script entry point: for a set of kernel ASTs returned by load_worlds, build the tensor-core and/or
# hand-coded optimized kernel and the beam-searched kernel, time each one, and count how often the
# beam-searched kernel is the fastest.
if __name__ == "__main__":
# FILTER_REDUCE (env) is forwarded to load_worlds as filter_reduce
filter_reduce = bool(getenv("FILTER_REDUCE"))
ast_strs = load_worlds(filter_reduce=filter_reduce, filter_novariable=True)
dev = Device[Device.DEFAULT]
# TEST_N (env, default 10) caps how many ASTs are tested
test_n = getenv("TEST_N", 10)
# NUM (env, default -1): when set, restrict the run to the single AST at that index
single = getenv("NUM", -1)
if single != -1: ast_strs = ast_strs[single:single+1]
# running counters for the final win-rate summary
beam_won, tested = 0, 0
for num, ast in enumerate(ast_strs[:test_n]):
# every candidate starts from its own fresh linearizer for the same AST
def new_lin(): return ast_str_to_lin(ast, opts=dev.renderer)
k = new_lin()
# try tensor cores first (TC env, default 1); if they were not applied, use the hand-coded optimizations instead
if not (used_tensor_cores:=k.apply_tensor_cores(getenv("TC", 1))): k.apply_opts(hand_coded_optimizations(k))
# the comparison below always runs beam_search with BEAM.value, so BEAM must be enabled
assert BEAM > 0
# candidates are (label, linearizer) pairs; the first is labelled "tc" or "hc" depending on which path was taken above
lins = [(("tc" if used_tensor_cores else "hc"), k)]
# when tensor cores were applied, also add a hand-coded-only candidate so both are timed
if used_tensor_cores:
lins.append(("hc", new_lin()))
lins[-1][1].apply_opts(hand_coded_optimizations(lins[-1][1]))
kb = new_lin()
test_rawbuffers = bufs_from_lin(kb) # allocate scratch buffers for optimization
# BEAM_ESTIMATE (env, default 1) is passed to beam_search as a bool
lins.append((f"beam{BEAM.value}", beam_search(kb, test_rawbuffers, BEAM.value, bool(getenv("BEAM_ESTIMATE", 1)))))
# time every candidate on the same buffers and sort ascending by measured time, so timed[0] is the fastest
timed = sorted([(nm, tk, time_linearizer(tk, test_rawbuffers, allow_test_size=False, clear_l2=True)) for nm, tk in lins], key=lambda x: x[2])
# with DEBUG >= 1, print the candidates fastest-first with their times in microseconds
if DEBUG >= 1: print(" < ".join(f"{nm:6s} : {lin.colored_shape(30, dense=True)} : {tm*1e6:8.2f} us" for nm, lin, tm in timed))
tested += 1
# a win for beam search means the fastest candidate's label starts with "beam"
if timed[0][0].startswith("beam"):
beam_won += 1
# NOTE(review): indentation is not visible in this view. If this summary runs after the loop, tested is 0
# when no ASTs were loaded and the division raises ZeroDivisionError -- confirm and guard if needed.
print(f"{beam_won=} / {tested=} = {beam_won/tested:.3f}")

View File

@@ -616,7 +616,8 @@ class TestLinearizer(unittest.TestCase):
"""
x, y = Tensor.randn(64,64), Tensor.randn(64,64)
out = x.matmul(y)
k = helper_linearizer_opt(out)[-1]
with Context(TC=0):
k = helper_linearizer_opt(out)[-1]
uops = get_program(k.ast, k.opts, k.applied_opts).uops
# check that the float4 cast collapses
store_vals = [u.src[1] for u in uops if u.op is Ops.STORE and u.src[0].dtype.addrspace != AddrSpace.REG]

View File

@@ -3,7 +3,7 @@
from tinygrad.codegen.opt.kernel import Kernel
from tinygrad.codegen.opt.heuristic import hand_coded_optimizations
from tinygrad.uop.ops import UOp, PatternMatcher, UPat, Ops, KernelInfo
from tinygrad.helpers import NOOPT, BEAM, USE_TC, getenv
from tinygrad.helpers import NOOPT, BEAM, getenv
from tinygrad.renderer import Renderer
from tinygrad.uop.spec import type_verify
@@ -25,7 +25,7 @@ def get_optimized_ast(ast:UOp, renderer:Renderer) -> UOp|None:
if new_arg is None:
k = Kernel(ast, opts=renderer)
if not NOOPT:
if not k.apply_tensor_cores(USE_TC.value): k.apply_opts(hand_coded_optimizations(k))
k.apply_opts(hand_coded_optimizations(k))
if BEAM >= 1:
from tinygrad.codegen.opt.search import beam_search, bufs_from_lin
kb = Kernel(ast, opts=renderer)

View File

@@ -1,10 +1,46 @@
import itertools
from tinygrad.codegen.opt.kernel import Kernel, Opt, OptOps, KernelOptError, AxisType
from tinygrad.helpers import getenv, DEBUG, prod, NOLOCALS
from tinygrad.helpers import getenv, DEBUG, prod, NOLOCALS, TC_OPT, TC_SELECT, USE_TC, AMX
from tinygrad.dtype import ImageDType
from tinygrad.uop.ops import Ops, resolve
def hand_coded_optimizations(k:Kernel) -> list[Opt]:
# first try the tensor cores
""" Attempts to apply a tensor core optimization to the kernel. If one exists and applies properly, return true, otherwise return false.
Tensor cores are optimized instructions that matrix multiply-accumulate across a wave of threads: D(M, N) = A(M, K) * B(K, N) + C(M, N).
Keyword arguments:
use_tensor_cores -- controls how tensor cores are applied (default 1)
0: will disable any tensor core matching
1: enable tensor cores
2: apply tensor core shape but don't use UOp.WMMA
extra_opts -- additional Opt's to apply after the tensor core instead of the hand-coded additional Opt's (default None)
tc_select -- specifies which tensor core(s) to use for optimization (default -1)
-1: iterates through all available tensor cores in order and uses the first one that matches the requirements (dims and dtypes)
[0-N]: uses only the n'th tensor core available; useful for search
tc_opt -- controls which kinds of kernels may be eligible for tensor cores application (default 2 during BEAM, 0 otherwise)
0: applies to only kernels with a single reduce axis and direct Ops.LOAD into Ops.MUL
1: allows kernels with multiple reduce axes and also multiplication of Ops.CAST'd buffers
2: allows kernels with M, N, K axes that are not multiples of the tensor core dimensions by applying padding those axes as needed
"""
if USE_TC > 0:
try: # check TC first and apply hand-coded opts if successful
tk = k.copy()
tk.apply_opt(Opt(OptOps.TC, 0, (TC_SELECT.value, TC_OPT.value, USE_TC.value)))
# skip hand-coded TC opts if AMX, upcasting will make kernel slower
if (tc_opts:=tk.tensor_core_opts) is not None and not AMX:
# hand-coded TC opts
for tc_dim in [tc_dim for tc_dim in [1,0] if tc_opts.axes_exist[tc_dim]]: # attempt to upcast M and N
szs = [sz for sz in [5,4,3,2] if tk.full_shape[tc_opts.axes[tc_dim]] % sz == 0]
if szs: tk.apply_opt(Opt(OptOps.UPCAST, tc_opts.axes[tc_dim], szs[0]))
if tc_opts.axes_exist[0] and (szs := [sz for sz in [4,2] if tk.full_shape[tc_opts.axes[0]] % sz == 0]): # attempt to local N
tk.apply_opt(Opt(OptOps.LOCAL, tc_opts.axes[0], szs[0]))
return tk.applied_opts
except KernelOptError:
pass
# make a copy so it does not mutate the input
k = k.copy()

View File

@@ -11,7 +11,7 @@ from tinygrad.device import Device
from tinygrad.codegen.opt.tc import TensorCore
from tinygrad.renderer import Renderer
from tinygrad.dtype import ImageDType
from tinygrad.helpers import all_same, colored, ansilen, dedup, prod, round_up, to_function_name, unwrap, argfix, DEBUG, TC_SELECT, TC_OPT, AMX
from tinygrad.helpers import all_same, colored, ansilen, dedup, prod, round_up, to_function_name, unwrap, argfix, DEBUG
from tinygrad.shape.shapetracker import ShapeTracker
from tinygrad.shape.view import strides_for_shape, get_contraction
from tinygrad.codegen.opt.swizzler import view_left, view_left_through_load
@@ -399,41 +399,6 @@ class Kernel:
return True
return False
def apply_tensor_cores(self, use_tensor_cores=1) -> bool:
""" Attempts to apply a tensor core optimization to the kernel. If one exists and applies properly, return true, otherwise return false.
Tensor cores are optimized instructions that matrix multiply-accumulate across a wave of threads: D(M, N) = A(M, K) * B(K, N) + C(M, N).
The kernel is modified in place through self.apply_opt.
Arguments:
use_tensor_cores -- controls how tensor cores are applied (default 1)
0: will disable any tensor core matching
1: enable tensor cores
2: apply tensor core shape but don't use UOp.WMMA
Settings that are NOT parameters of this method; they are read from the TC_SELECT and TC_OPT context variables and passed in the TC Opt:
TC_SELECT -- specifies which tensor core(s) to use for optimization (default -1)
-1: iterates through all available tensor cores in order and uses the first one that matches the requirements (dims and dtypes)
[0-N]: uses only the n'th tensor core available; useful for search
TC_OPT -- controls which kinds of kernels may be eligible for tensor cores application (default 2 during BEAM, 0 otherwise)
0: applies to only kernels with a single reduce axis and direct Ops.LOAD into Ops.MUL
1: allows kernels with multiple reduce axes and also multiplication of Ops.CAST'd buffers
2: allows kernels with M, N, K axes that are not multiples of the tensor core dimensions by applying padding those axes as needed
Returns False without touching the kernel when self.opts.tensor_cores is empty, and False when applying an Opt raises KernelOptError.
NOTE: an extra_opts argument (Opt's to apply instead of the hand-coded ones) is not implemented; the follow-up opts below are always the hand-coded ones.
"""
# nothing to do if the renderer reports no tensor cores
if not self.opts.tensor_cores: return False
try: # check TC first and apply hand-coded opts if successful
self.apply_opt(Opt(OptOps.TC, 0, (TC_SELECT.value, TC_OPT.value, use_tensor_cores)))
if (tc_opts:=self.tensor_core_opts) is not None:
if AMX: return True # skip hand-coded TC opts if AMX, upcasting will make kernel slower
# hand-coded TC opts
# for each existing tensor core axis (index 1, then index 0), upcast by the largest of 5,4,3,2 that divides the axis size
for tc_dim in [tc_dim for tc_dim in [1,0] if tc_opts.axes_exist[tc_dim]]: # attempt to upcast M and N
szs = [sz for sz in [5,4,3,2] if self.full_shape[tc_opts.axes[tc_dim]] % sz == 0]
if szs: self.apply_opt(Opt(OptOps.UPCAST, tc_opts.axes[tc_dim], szs[0]))
# then add a LOCAL of 4 (else 2) on axis index 0 when its size is divisible
if tc_opts.axes_exist[0] and (szs := [sz for sz in [4,2] if self.full_shape[tc_opts.axes[0]] % sz == 0]): # attempt to local N
self.apply_opt(Opt(OptOps.LOCAL, tc_opts.axes[0], szs[0]))
return True
except KernelOptError:
# an Opt could not be applied: report that tensor cores were not used
return False
# strings like ['g0', 'g1', 'l0', 'l1', 'l2', 'l3', 'l4', 'l5', 'R0', 'r0', 'r1', 'r2', 'u0', 'u1', 'u2']
def shape_str(self) -> list[str]:
ret: list[str] = []