mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-13 00:15:35 +08:00
use_tensor_cores is a heuristic (#11989)
* use_tensor_cores is a heuristic
* context
This commit is contained in:
41
test/external/speed_beam_v_hcopt.py
vendored
41
test/external/speed_beam_v_hcopt.py
vendored
@@ -1,41 +0,0 @@
|
||||
from tinygrad import Device
|
||||
from tinygrad.helpers import getenv, DEBUG, BEAM
|
||||
from tinygrad.codegen.opt.search import beam_search, bufs_from_lin
|
||||
from tinygrad.codegen.opt.heuristic import hand_coded_optimizations
|
||||
from extra.optimization.helpers import load_worlds, ast_str_to_lin, time_linearizer
|
||||
|
||||
if __name__ == "__main__":
|
||||
filter_reduce = bool(getenv("FILTER_REDUCE"))
|
||||
ast_strs = load_worlds(filter_reduce=filter_reduce, filter_novariable=True)
|
||||
dev = Device[Device.DEFAULT]
|
||||
|
||||
test_n = getenv("TEST_N", 10)
|
||||
single = getenv("NUM", -1)
|
||||
if single != -1: ast_strs = ast_strs[single:single+1]
|
||||
|
||||
beam_won, tested = 0, 0
|
||||
|
||||
for num, ast in enumerate(ast_strs[:test_n]):
|
||||
def new_lin(): return ast_str_to_lin(ast, opts=dev.renderer)
|
||||
|
||||
k = new_lin()
|
||||
|
||||
if not (used_tensor_cores:=k.apply_tensor_cores(getenv("TC", 1))): k.apply_opts(hand_coded_optimizations(k))
|
||||
|
||||
assert BEAM > 0
|
||||
|
||||
lins = [(("tc" if used_tensor_cores else "hc"), k)]
|
||||
if used_tensor_cores:
|
||||
lins.append(("hc", new_lin()))
|
||||
lins[-1][1].apply_opts(hand_coded_optimizations(lins[-1][1]))
|
||||
kb = new_lin()
|
||||
test_rawbuffers = bufs_from_lin(kb) # allocate scratch buffers for optimization
|
||||
lins.append((f"beam{BEAM.value}", beam_search(kb, test_rawbuffers, BEAM.value, bool(getenv("BEAM_ESTIMATE", 1)))))
|
||||
timed = sorted([(nm, tk, time_linearizer(tk, test_rawbuffers, allow_test_size=False, clear_l2=True)) for nm, tk in lins], key=lambda x: x[2])
|
||||
if DEBUG >= 1: print(" < ".join(f"{nm:6s} : {lin.colored_shape(30, dense=True)} : {tm*1e6:8.2f} us" for nm, lin, tm in timed))
|
||||
|
||||
tested += 1
|
||||
if timed[0][0].startswith("beam"):
|
||||
beam_won += 1
|
||||
|
||||
print(f"{beam_won=} / {tested=} = {beam_won/tested:.3f}")
|
||||
@@ -616,7 +616,8 @@ class TestLinearizer(unittest.TestCase):
|
||||
"""
|
||||
x, y = Tensor.randn(64,64), Tensor.randn(64,64)
|
||||
out = x.matmul(y)
|
||||
k = helper_linearizer_opt(out)[-1]
|
||||
with Context(TC=0):
|
||||
k = helper_linearizer_opt(out)[-1]
|
||||
uops = get_program(k.ast, k.opts, k.applied_opts).uops
|
||||
# check that the float4 cast collapses
|
||||
store_vals = [u.src[1] for u in uops if u.op is Ops.STORE and u.src[0].dtype.addrspace != AddrSpace.REG]
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
from tinygrad.codegen.opt.kernel import Kernel
|
||||
from tinygrad.codegen.opt.heuristic import hand_coded_optimizations
|
||||
from tinygrad.uop.ops import UOp, PatternMatcher, UPat, Ops, KernelInfo
|
||||
from tinygrad.helpers import NOOPT, BEAM, USE_TC, getenv
|
||||
from tinygrad.helpers import NOOPT, BEAM, getenv
|
||||
from tinygrad.renderer import Renderer
|
||||
from tinygrad.uop.spec import type_verify
|
||||
|
||||
@@ -25,7 +25,7 @@ def get_optimized_ast(ast:UOp, renderer:Renderer) -> UOp|None:
|
||||
if new_arg is None:
|
||||
k = Kernel(ast, opts=renderer)
|
||||
if not NOOPT:
|
||||
if not k.apply_tensor_cores(USE_TC.value): k.apply_opts(hand_coded_optimizations(k))
|
||||
k.apply_opts(hand_coded_optimizations(k))
|
||||
if BEAM >= 1:
|
||||
from tinygrad.codegen.opt.search import beam_search, bufs_from_lin
|
||||
kb = Kernel(ast, opts=renderer)
|
||||
|
||||
@@ -1,10 +1,46 @@
|
||||
import itertools
|
||||
from tinygrad.codegen.opt.kernel import Kernel, Opt, OptOps, KernelOptError, AxisType
|
||||
from tinygrad.helpers import getenv, DEBUG, prod, NOLOCALS
|
||||
from tinygrad.helpers import getenv, DEBUG, prod, NOLOCALS, TC_OPT, TC_SELECT, USE_TC, AMX
|
||||
from tinygrad.dtype import ImageDType
|
||||
from tinygrad.uop.ops import Ops, resolve
|
||||
|
||||
def hand_coded_optimizations(k:Kernel) -> list[Opt]:
|
||||
# first try the tensor cores
|
||||
""" Attempts to apply a tensor core optimization to a copy of the kernel first. If one exists and applies properly, return the opts applied to that copy; otherwise continue with the rest of the hand-coded optimizations.
|
||||
Tensor cores are optimized instructions that matrix multiply-accumulate across a wave of threads: D(M, N) = A(M, K) * B(K, N) + C(M, N).
|
||||
|
||||
Keyword arguments:
|
||||
use_tensor_cores -- controls how tensor cores are applied (default 1)
|
||||
0: will disable any tensor core matching
|
||||
1: enable tensor cores
|
||||
2: apply tensor core shape but don't use UOp.WMMA
|
||||
extra_opts -- additional Opt's to apply after the tensor core instead of the hand-coded additional Opt's (default None)
|
||||
tc_select -- specifies which tensor core(s) to use for optimization (default -1)
|
||||
-1: iterates through all available tensor cores in order and uses the first one that matches the requirements (dims and dtypes)
|
||||
[0-N]: uses only the n'th tensor core available; useful for search
|
||||
tc_opt -- controls which kinds of kernels may be eligible for tensor cores application (default 2 during BEAM, 0 otherwise)
|
||||
0: applies to only kernels with a single reduce axis and direct Ops.LOAD into Ops.MUL
|
||||
1: allows kernels with multiple reduce axes and also multiplication of Ops.CAST'd buffers
|
||||
2: allows kernels with M, N, K axes that are not multiples of the tensor core dimensions by padding those axes as needed
|
||||
"""
|
||||
if USE_TC > 0:
|
||||
try: # check TC first and apply hand-coded opts if successful
|
||||
tk = k.copy()
|
||||
tk.apply_opt(Opt(OptOps.TC, 0, (TC_SELECT.value, TC_OPT.value, USE_TC.value)))
|
||||
|
||||
# skip hand-coded TC opts if AMX, upcasting will make kernel slower
|
||||
if (tc_opts:=tk.tensor_core_opts) is not None and not AMX:
|
||||
# hand-coded TC opts
|
||||
for tc_dim in [tc_dim for tc_dim in [1,0] if tc_opts.axes_exist[tc_dim]]: # attempt to upcast M and N
|
||||
szs = [sz for sz in [5,4,3,2] if tk.full_shape[tc_opts.axes[tc_dim]] % sz == 0]
|
||||
if szs: tk.apply_opt(Opt(OptOps.UPCAST, tc_opts.axes[tc_dim], szs[0]))
|
||||
|
||||
if tc_opts.axes_exist[0] and (szs := [sz for sz in [4,2] if tk.full_shape[tc_opts.axes[0]] % sz == 0]): # attempt to local N
|
||||
tk.apply_opt(Opt(OptOps.LOCAL, tc_opts.axes[0], szs[0]))
|
||||
return tk.applied_opts
|
||||
except KernelOptError:
|
||||
pass
|
||||
|
||||
# make a copy so it does not mutate the input
|
||||
k = k.copy()
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ from tinygrad.device import Device
|
||||
from tinygrad.codegen.opt.tc import TensorCore
|
||||
from tinygrad.renderer import Renderer
|
||||
from tinygrad.dtype import ImageDType
|
||||
from tinygrad.helpers import all_same, colored, ansilen, dedup, prod, round_up, to_function_name, unwrap, argfix, DEBUG, TC_SELECT, TC_OPT, AMX
|
||||
from tinygrad.helpers import all_same, colored, ansilen, dedup, prod, round_up, to_function_name, unwrap, argfix, DEBUG
|
||||
from tinygrad.shape.shapetracker import ShapeTracker
|
||||
from tinygrad.shape.view import strides_for_shape, get_contraction
|
||||
from tinygrad.codegen.opt.swizzler import view_left, view_left_through_load
|
||||
@@ -399,41 +399,6 @@ class Kernel:
|
||||
return True
|
||||
return False
|
||||
|
||||
def apply_tensor_cores(self, use_tensor_cores=1) -> bool: # , extra_opts:list[Opt]|None=None) -> bool:
|
||||
""" Attempts to apply a tensor core optimization to the kernel. If one exists and applies properly, return true, otherwise return false.
|
||||
Tensor cores are optimized instructions that matrix multiply-accumulate across a wave of threads: D(M, N) = A(M, K) * B(K, N) + C(M, N).
|
||||
|
||||
Keyword arguments:
|
||||
use_tensor_cores -- controls how tensor cores are applied (default 1)
|
||||
0: will disable any tensor core matching
|
||||
1: enable tensor cores
|
||||
2: apply tensor core shape but don't use UOp.WMMA
|
||||
extra_opts -- additional Opt's to apply after the tensor core instead of the hand-coded additional Opt's (default None)
|
||||
tc_select -- specifies which tensor core(s) to use for optimization (default -1)
|
||||
-1: iterates through all available tensor cores in order and uses the first one that matches the requirements (dims and dtypes)
|
||||
[0-N]: uses only the n'th tensor core available; useful for search
|
||||
tc_opt -- controls which kinds of kernels may be eligible for tensor cores application (default 2 during BEAM, 0 otherwise)
|
||||
0: applies to only kernels with a single reduce axis and direct Ops.LOAD into Ops.MUL
|
||||
1: allows kernels with multiple reduce axes and also multiplication of Ops.CAST'd buffers
|
||||
2: allows kernels with M, N, K axes that are not multiples of the tensor core dimensions by padding those axes as needed
|
||||
"""
|
||||
if not self.opts.tensor_cores: return False
|
||||
try: # check TC first and apply hand-coded opts if successful
|
||||
self.apply_opt(Opt(OptOps.TC, 0, (TC_SELECT.value, TC_OPT.value, use_tensor_cores)))
|
||||
|
||||
if (tc_opts:=self.tensor_core_opts) is not None:
|
||||
if AMX: return True # skip hand-coded TC opts if AMX, upcasting will make kernel slower
|
||||
# hand-coded TC opts
|
||||
for tc_dim in [tc_dim for tc_dim in [1,0] if tc_opts.axes_exist[tc_dim]]: # attempt to upcast M and N
|
||||
szs = [sz for sz in [5,4,3,2] if self.full_shape[tc_opts.axes[tc_dim]] % sz == 0]
|
||||
if szs: self.apply_opt(Opt(OptOps.UPCAST, tc_opts.axes[tc_dim], szs[0]))
|
||||
|
||||
if tc_opts.axes_exist[0] and (szs := [sz for sz in [4,2] if self.full_shape[tc_opts.axes[0]] % sz == 0]): # attempt to local N
|
||||
self.apply_opt(Opt(OptOps.LOCAL, tc_opts.axes[0], szs[0]))
|
||||
return True
|
||||
except KernelOptError:
|
||||
return False
|
||||
|
||||
# strings like ['g0', 'g1', 'l0', 'l1', 'l2', 'l3', 'l4', 'l5', 'R0', 'r0', 'r1', 'r2', 'u0', 'u1', 'u2']
|
||||
def shape_str(self) -> list[str]:
|
||||
ret: list[str] = []
|
||||
|
||||
Reference in New Issue
Block a user