diff --git a/test/test_linearizer.py b/test/test_linearizer.py
index 746795dc12..89cd4ccf33 100644
--- a/test/test_linearizer.py
+++ b/test/test_linearizer.py
@@ -13,7 +13,8 @@ from tinygrad.shape.symbolic import MulNode, Variable, NumNode, Node
 from tinygrad.tensor import Tensor
 from tinygrad.engine.schedule import create_schedule
 from tinygrad.engine.realize import run_schedule, lower_schedule, CompiledRunner
-from tinygrad.helpers import prod, Context, getenv, CI
+from tinygrad.engine.graph import print_tree
+from tinygrad.helpers import DEBUG, prod, Context, getenv, CI
 from tinygrad.dtype import DType, dtypes
 
 def helper_realized_ast(r:Tensor):
@@ -977,6 +978,23 @@ def _helper_linearizer_opt_ast(realized_ast:Tuple[LazyOp, ...], real_bufs:List[B
     check_opt(x, lambda: Linearizer(*realized_ast), color_sizes[i] if i < len(color_sizes) else None)
   return lins
 
+# creates a back-to-back multi reduce AST by merging r0 and r1 with `merge` (default: BinaryOps.ADD of the two).
+# TODO: delete once we can schedule multi reduce
+def _temp_create_multireduce_ast(r0:Tensor, r1:Tensor, merge=lambda r0,r1: LazyOp(BinaryOps.ADD, (r0, r1))) -> Tuple[LazyOp, ...]:
+  assert len(s0:=r0.schedule()) == 1 and len(s1:=r1.schedule()) == 1, "inputs should be realized"
+  op0, op1 = s0[0].ast[0].src[0], s1[0].ast[0].src[0]
+  def _deep_replace(op:LazyOp, offset=0):
+    if op.op is BufferOps.LOAD: arg = MemBuffer(op.arg.idx+offset, op.arg.dtype, op.arg.st)
+    else: arg = op.arg
+    return LazyOp(op.op, tuple(_deep_replace(x, offset) for x in op.src), arg)
+  # limitation: r0 and r1 cannot share inputs, LOAD indices of op1 are offset by the number of LOADs in op0.
+  op0_loads = len([x for x in op0.lazyops if x.op is BufferOps.LOAD])
+  out = merge(_deep_replace(op0), _deep_replace(op1, op0_loads))
+  # limitation: only tests single output
+  op = LazyOp(BufferOps.STORE, (out, ), MemBuffer(0, s0[-1].ast[-1].arg.dtype, s0[-1].ast[-1].arg.st))
+  if DEBUG >= 3: print_tree(op)
+  return op,
+
 class TestKernelOpts(unittest.TestCase):
   @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals")
   @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared")
@@ -1002,6 +1020,36 @@ class TestKernelOpts(unittest.TestCase):
       [Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.UPCAST, 0, 8), Opt(OptOps.UNROLL, 1, 4)],
     ])
 
+  @unittest.skip("multireduce isn't supported yet")
+  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals")
+  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared")
+  def test_local_and_grouped_reduce_multireduce(self):
+    N = 128
+    Tensor.manual_seed(1882)
+    a = Tensor.rand(4, 4, N, N).realize()
+    b = Tensor.rand(4, 4, N).realize()
+    # TODO: this isn't the best AST, it's always math.inf
+    r0 = (b.sqrt() + ((a+1).sum(axis=3).exp()))
+    c = Tensor.rand(4, 4, N, N).realize()
+    d = Tensor.rand(4, 4, N).realize()
+    r1 = (d.sqrt() + ((c+1).sum(axis=3).exp()))
+    ast = _temp_create_multireduce_ast(r0, r1)
+    helper_linearizer_ast(ast, [a, b, c, d], [
+      [Opt(OptOps.LOCAL, 0, 2)],
+      [Opt(OptOps.LOCAL, 0, 8)],
+      [Opt(OptOps.LOCAL, 0, 16)], # Checking how it works with locals
+      [Opt(OptOps.GROUPTOP, 0, 2)],
+      [Opt(OptOps.GROUPTOP, 0, 32)],
+      [Opt(OptOps.GROUPTOP, 0, 64)], # Checking how it works with grouped reduce
+      [Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.GROUPTOP, 0, 2)],
+      [Opt(OptOps.LOCAL, 0, 16), Opt(OptOps.GROUPTOP, 0, 16)],
+      [Opt(OptOps.LOCAL, 0, 32), Opt(OptOps.GROUPTOP, 0, 2)],
+      # Checking how it works with locals + grouped reduce
+      [Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.GROUPTOP, 0, 64)],
+      # Checking how it works with locals + grouped reduce + upcasts
+      [Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.UPCAST, 0, 8), Opt(OptOps.UNROLL, 1, 4)],
+    ])
+
   def test_upcasts(self):
     N = 16
     Tensor.manual_seed(1772)
@@ -1052,6 +1100,40 @@ class TestKernelOpts(unittest.TestCase):
       [Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.GROUPTOP, 0, 8), Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.UPCAST, 0, 8)],
     ])
 
+  @unittest.skip("multireduce isn't supported yet")
+  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals")
+  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared")
+  def test_matmul_multireduce(self):
+    N = 128
+    Tensor.manual_seed(1552)
+    a = Tensor.rand(N, N).realize()
+    b = Tensor.rand(N, N).realize()
+    r0 = a@b
+    c = Tensor.rand(N, N).realize()
+    d = Tensor.rand(N, N).realize()
+    r1 = c@d
+    ast = _temp_create_multireduce_ast(r0, r1)
+    helper_linearizer_ast(ast, [a, b, c, d], [
+      [Opt(OptOps.UPCAST, 0, 2)],
+      [Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 4)], # Checking how it works with upcasts
+      [Opt(OptOps.LOCAL, 0, 2)],
+      [Opt(OptOps.LOCAL, 1, 32)],
+      [Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 1, 4)],
+      [Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 1, 32)],
+      [Opt(OptOps.LOCAL, 0, 16), Opt(OptOps.LOCAL, 1, 8)], # Checking how it works with locals
+      [Opt(OptOps.GROUPTOP, 0, 2)],
+      [Opt(OptOps.GROUPTOP, 0, 32)],
+      [Opt(OptOps.GROUPTOP, 0, 32), Opt(OptOps.UNROLL, 0, 4)], # Checking how it works with grouped_reduce
+      [Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.LOCAL, 1, 2), Opt(OptOps.GROUPTOP, 0, 32)],
+      [Opt(OptOps.LOCAL, 0, 8), Opt(OptOps.GROUPTOP, 0, 32)],
+      [Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 0, 8), Opt(OptOps.GROUPTOP, 0, 4)], # Checking how it works with local+grouped_reduce
+      # Checking all together
+      [Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.GROUPTOP, 0, 8), Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.UPCAST, 0, 4),
+       Opt(OptOps.UPCAST, 1, 2)],
+      # Full global upcast + local
+      [Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.GROUPTOP, 0, 8), Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.UPCAST, 0, 8)],
+    ], wanna_output=[(a.numpy()@b.numpy()+c.numpy()@d.numpy()).flatten()])
+
   @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals")
   @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared")
   def test_double_reduce(self):
@@ -1078,6 +1160,36 @@ class TestKernelOpts(unittest.TestCase):
        Opt(OptOps.UPCAST, 0, 2)], # No globals
     ])
 
+  @unittest.skip("multireduce isn't supported yet")
+  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals")
+  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared")
+  def test_double_reduce_multireduce(self):
+    N = 128
+    Tensor.manual_seed(1552)
+    a = Tensor.rand(8, N, 8, N).realize()
+    r0 = a.sum(axis=(1,3))
+    b = Tensor.rand(8, N, 8, N).realize()
+    r1 = b.sum(axis=(1,3))
+    ast = _temp_create_multireduce_ast(r0, r1)
+    helper_linearizer_ast(ast, [a, b], [
+      # openCL / GPU=1 is 256 max threads
+      [Opt(OptOps.GROUPTOP, 0, 2)], [Opt(OptOps.GROUPTOP, 0, 32)],
+      [Opt(OptOps.GROUPTOP, 1, 2)], [Opt(OptOps.GROUPTOP, 1, 32)], # Checking how it works with 1 grouped_reduce.
+      [Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 2)],
+      [Opt(OptOps.GROUPTOP, 0, 16), Opt(OptOps.GROUPTOP, 1, 2)],
+      [Opt(OptOps.GROUPTOP, 0, 4), Opt(OptOps.GROUPTOP, 1, 64)], # Checking how it works with 2 grouped_reduces.
+      [Opt(OptOps.GROUPTOP, 0, 16), Opt(OptOps.GROUPTOP, 1, 2), Opt(OptOps.UNROLL, 0, 4)],
+      [Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 32), Opt(OptOps.UNROLL, 2, 4)], # Checking how it works with 2 grouped_reduces + upcasts.
+      [Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 1, 4), Opt(OptOps.GROUPTOP, 0, 4), Opt(OptOps.GROUPTOP, 1, 4)],
+      # Checking how it works with 2 grouped_reduces + upcasts + locals.
+      [Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 1, 4), Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 32), Opt(OptOps.UNROLL, 1, 4)],
+      [Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.LOCAL, 1, 2), Opt(OptOps.GROUPTOP, 0, 8), Opt(OptOps.GROUPTOP, 1, 4), Opt(OptOps.UPCAST, 0, 2)],
+      [Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.LOCAL, 1, 2), Opt(OptOps.GROUPTOP, 0, 8), Opt(OptOps.GROUPTOP, 1, 4), Opt(OptOps.UPCAST, 0, 2),
+       Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.UNROLL, 1, 4)], # Checking how it works with 2 grouped_reduces + upcasts + locals.
+      [Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 1, 4), Opt(OptOps.GROUPTOP, 0, 4), Opt(OptOps.GROUPTOP, 1, 4), Opt(OptOps.UPCAST, 0, 2),
+       Opt(OptOps.UPCAST, 0, 2)], # No globals
+    ], wanna_output=[(a.numpy().sum(axis=(1, 3))+b.numpy().sum(axis=(1, 3))).flatten()])
+
   @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
   def test_invalid_tensor_core_extra_opts(self):
     N = 128
@@ -1132,6 +1244,40 @@ class TestKernelOpts(unittest.TestCase):
         # [Opt(OptOps.GROUP, 0, 2)] # doesn't work because group_for_reduce dims become early locals (conflicting with TC)
       ], apply_tc=True, atol=atol, rtol=rtol)
 
+  @unittest.skip("multireduce isn't supported yet")
+  @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
+  def test_tensor_core_opts_multireduce(self):
+    N = 128
+    Tensor.manual_seed(1552)
+    for tc in Device[Device.DEFAULT].renderer.tensor_cores:
+      # bf16 buffer returns float32 numpy outputs so test would fail. testing opt with half suffices.
+      if tc.dtype_in == dtypes.bfloat16: continue
+      a, b = Tensor.rand(N, N, dtype=tc.dtype_in).realize(), Tensor.rand(N, N, dtype=tc.dtype_in).realize()
+      r0 = a.matmul(b, acc_dtype=tc.dtype_out)
+      c, d = Tensor.rand(N, N, dtype=tc.dtype_in).realize(), Tensor.rand(N, N, dtype=tc.dtype_in).realize()
+      r1 = c.matmul(d, acc_dtype=tc.dtype_out)
+      ast = _temp_create_multireduce_ast(r0, r1)
+      (atol, rtol) = ((0.25, 0.01) if tc.dtype_out == dtypes.half else (3e-2, 1e-3)) if tc.dtype_in == dtypes.half else (1e-4, 1e-4)
+      helper_linearizer_ast(ast, [a, b, c, d], [
+        [],
+        [Opt(OptOps.UPCAST, 0, 4)],
+        [Opt(OptOps.UPCAST, 1, 4)],
+        [Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 4)], # check upcasts
+        [Opt(OptOps.UNROLL, 0, 2)], # check unroll
+        [Opt(OptOps.UNROLL, 0, 0)], # check full unroll of reduce with locals
+        [Opt(OptOps.LOCAL, 0, 4)], # check local
+        [Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 2)], # check combo of unroll and local
+        [Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 4), Opt(OptOps.UNROLL, 0, 2)],
+        [Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 4), Opt(OptOps.UNROLL, 0, 4)],
+        [Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 4), Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.LOCAL, 0, 2)],
+        [Opt(OptOps.UPCAST, 1, 4), Opt(OptOps.UPCAST, 0, 4)], # check permutations
+        [Opt(OptOps.UNROLL, 0, 2), Opt(OptOps.UPCAST, 0, 4)],
+        [Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 2), Opt(OptOps.UPCAST, 1, 4)],
+        [Opt(OptOps.UNROLL, 0, 2), Opt(OptOps.UPCAST, 1, 4), Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)],
+        [Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.UPCAST, 1, 4), Opt(OptOps.UNROLL, 0, 2), Opt(OptOps.UPCAST, 0, 4)],
+        # [Opt(OptOps.GROUP, 0, 2)] # doesn't work because group_for_reduce dims become early locals (conflicting with TC)
+      ], apply_tc=True, atol=atol, rtol=rtol, wanna_output=[np.matmul(a.numpy(), b.numpy()).flatten() + np.matmul(c.numpy(), d.numpy()).flatten()])
+
   def test_padto_matmul(self):
     if CI and Device.DEFAULT in ["CUDA", "AMD", "NV"]: self.skipTest("super slow on CUDA and AMD because of the big grid dims")
     N = 17 * 17
diff --git a/test/test_uop_graph.py b/test/test_uop_graph.py
index e7bed69c84..150fba6b02 100644
--- a/test/test_uop_graph.py
+++ b/test/test_uop_graph.py
@@ -59,7 +59,7 @@ class TestUOpGraph(unittest.TestCase):
     cast = g.add(UOps.CAST, dtypes.float.vec(2), (ld,))
     x = g.add(UOps.GEP, dtypes.float, (cast, ), arg=0)
     alu = g.add(UOps.ALU, dtypes.float, (x, ), UnaryOps.SQRT)
-    out = g.add(UOps.STORE, dtypes.float, (d0, idx, alu), UnaryOps.SQRT)
+    out = g.add(UOps.STORE, dtypes.float, (d0, idx, alu))
     g.add(UOps.SINK, None, (out,))
     self.assertEqual(len([x for x in g.uops if x.uop is UOps.CAST]), 0)
 
diff --git a/test/test_uops.py b/test/test_uops.py
index ebd6375ce6..bc99773f2f 100644
--- a/test/test_uops.py
+++ b/test/test_uops.py
@@ -221,7 +221,7 @@ class TestConstantFolding(unittest.TestCase):
 class TestLocalAccess(unittest.TestCase):
   # NOTE: this is failing on METAL CI, no idea why. Works locally.
   @unittest.skipIf(Device.DEFAULT == "METAL" and CI, "failing only in CI")
-  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals")
+  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared memory")
   def test_local_basic(self):
     uops = []
     smem = uop(uops, UOps.DEFINE_LOCAL, PtrDType(dtypes.float32), (), ('smem', 16))
@@ -230,7 +230,7 @@ class TestLocalAccess(unittest.TestCase):
     sres = uop(uops, UOps.LOAD, dtypes.float32, (smem, uop(uops, UOps.CONST, dtypes.int32, (), 0), barr))
     self.assertEqual(_test_uops_result(dtypes.float32, uops, sres), 42)
 
-  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals")
+  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared memory")
   def test_local_indirect(self):
     uops = []
     smem = uop(uops, UOps.DEFINE_LOCAL, PtrDType(dtypes.int32), (), ('smem', 16))