From a1f70ce7d0cd29deb6bbb73a2586042642e38f03 Mon Sep 17 00:00:00 2001 From: qazal <77887910+Qazalin@users.noreply.github.com> Date: Wed, 15 Jan 2025 12:34:15 -0500 Subject: [PATCH] only use BUFFER_VIEW in disk [pr] (#8629) * only use BUFFER_VIEW in disk [pr] * delete can_view * BUFFER_VIEW op on DISK * remove that allow_buffer_view=False * notes * bitcast is a low-level op too * this passes on AMD and LLVM --- test/test_schedule.py | 5 +++-- test/test_setitem.py | 10 ++-------- test/test_tensor_uop.py | 3 +-- tinygrad/ops.py | 13 +++++++------ tinygrad/tensor.py | 2 +- 5 files changed, 14 insertions(+), 19 deletions(-) diff --git a/test/test_schedule.py b/test/test_schedule.py index 2d531f0170..a0cd1b3272 100644 --- a/test/test_schedule.py +++ b/test/test_schedule.py @@ -13,7 +13,7 @@ from tinygrad.device import is_dtype_supported from tinygrad.dtype import DType, ImageDType from tinygrad.shape.shapetracker import ShapeTracker from tinygrad.shape.view import View -from tinygrad.ops import PatternMatcher, UOp, Ops, UPat, graph_rewrite, track_rewrites, view_supported_devices, symbolic_simple, merge_views +from tinygrad.ops import PatternMatcher, UOp, Ops, UPat, graph_rewrite, track_rewrites, symbolic_simple, merge_views from tinygrad.helpers import CI, DEBUG, FUSE_ARANGE, GlobalCounters, getenv, SPLIT_REDUCEOP, unwrap, prod, Context from tinygrad.codegen.kernel import verify_ast from tinygrad.engine.schedule import ScheduleItem, create_schedule_with_vars, view_right, view_left, remove_movement_ops @@ -1630,7 +1630,8 @@ class TestIndexing(unittest.TestCase): a[0] = 6 np.testing.assert_equal(a.numpy(), [6., 2., 3., 4.]) - @unittest.skipUnless(Device.DEFAULT in view_supported_devices, "need view") + #@unittest.skipUnless(Device.DEFAULT in view_supported_devices, "need view") + @unittest.skip("BUFFER_VIEW no longer supported on non-disk devices") def test_arange_view_op(self): a = Tensor.arange(12).reshape(4, 3).shrink(((1, 2), (1, 3))).contiguous() sched = self.check_schedule(a, 1) diff --git a/test/test_setitem.py b/test/test_setitem.py index 84ccb8bc1f..f1bb595ef2 100644 --- a/test/test_setitem.py +++ b/test/test_setitem.py @@ -1,6 +1,5 @@ import unittest -from tinygrad import Device, Tensor, TinyJit, Variable, dtypes -from tinygrad.helpers import CI +from tinygrad import Tensor, TinyJit, Variable, dtypes import numpy as np class TestSetitem(unittest.TestCase): @@ -139,12 +138,7 @@ class TestSetitem(unittest.TestCase): def test_setitem_overlapping_inplace1(self): t = Tensor([[3.0], [2.0], [1.0]]).contiguous() t[1:] = t[:-1] - if (Device.DEFAULT == "LLVM") or (CI and Device.DEFAULT == "AMD"): - # TODO: FIXME - with self.assertRaises(AssertionError): - self.assertEqual(t.tolist(), [[3.0], [3.0], [2.0]]) - else: - self.assertEqual(t.tolist(), [[3.0], [3.0], [2.0]]) + self.assertEqual(t.tolist(), [[3.0], [3.0], [2.0]]) def test_setitem_overlapping_inplace2(self): t = Tensor([[3.0], [2.0], [1.0]]).contiguous() diff --git a/test/test_tensor_uop.py b/test/test_tensor_uop.py index 03e1b71649..b2d4acad97 100644 --- a/test/test_tensor_uop.py +++ b/test/test_tensor_uop.py @@ -3,7 +3,7 @@ import numpy as np import unittest from tinygrad import Tensor, Device, dtypes from tinygrad.engine.realize import run_schedule -from tinygrad.ops import Ops, UOp, UPat, view_supported_devices +from tinygrad.ops import Ops, UOp, UPat class TestTensorUOp(unittest.TestCase): def test_fromcpu_shape_tracker(self): @@ -84,7 +84,6 @@ class TestTensorUOp(unittest.TestCase): sched = empty.schedule() self.assertEqual(len(sched), 0) - @unittest.skipIf(Device.DEFAULT in view_supported_devices, "BUFFER_VIEW cannot exist on a CONST") def test_contiguous_folded_alu(self): a = Tensor.empty(8, 8) # NOTE: the buffer for mul_0 late folds to just a CONST diff --git a/tinygrad/ops.py b/tinygrad/ops.py index 2c23541a64..fabd94bdfb 100644 --- a/tinygrad/ops.py +++ b/tinygrad/ops.py @@ -367,7 +367,7 @@ class UOp(MathTrait, metaclass=UOpMetaClass): raise RuntimeError(f"unsupported size in bitcast {dtype}") # shape changing bitcast can use a subbuffer on DISK # TODO: this should be moved to realize.py - if self.can_view() and self.device.startswith("DISK"): return UOp(Ops.BUFFER_VIEW, dtype, (self,)) + if self._device is not None and self.device.startswith("DISK"): return UOp(Ops.BUFFER_VIEW, dtype, (self,)) return UOp(Ops.BITCAST, dtype, (self,)) def gep(self, i:Union[tuple[int, ...], int]): if isinstance(i, int): @@ -420,9 +420,13 @@ class UOp(MathTrait, metaclass=UOpMetaClass): if DEBUG >= 3: print(f"split {divisor}: {self.shape} -> {splitted.shape} -> {new_shape}") return splitted._reduce_op(op, axis)._reduce_op(op, (len(new_shape),)).reshape(new_shape) # reduce original axes, then split def assign(self, x:UOp): return UOp(Ops.ASSIGN, self.dtype, (self,x)) - def contiguous(self, allow_buffer_view=True): + def contiguous(self): + # TODO: BUFFER_VIEW op should be deleted and subbuffer should be moved to realize.py + # NOTE: DISK uses subbuffer because DISK does not render kernels + if self.device.startswith("DISK"): return self.alu(Ops.BUFFER_VIEW) + # otherwise it's normal CONTIGUOUS if not unwrap(self.st).contiguous or self.size != self.base.size or self.base.op is Ops.CONST: - return self.alu(Ops.BUFFER_VIEW if allow_buffer_view and self.can_view() else Ops.CONTIGUOUS) + return self.alu(Ops.CONTIGUOUS) forced_realize.add(self.base) return self @@ -451,9 +455,6 @@ class UOp(MathTrait, metaclass=UOpMetaClass): return UOp(Ops.COPY, self.base.dtype, (UOp(Ops.DEVICE, arg=device), self.base), clone).view(unwrap(self.st)) def clone(self) -> UOp: return self.copy_to_device(self.device, clone=True) def is_unrealized_unmasked_const(self): return self.base.op is Ops.CONST and all(v.mask is None for v in unwrap(self.st).views) - def can_view(self): - return (self.st is not None and self._device is not None and self.st.consecutive and self.base.op is not Ops.CONST and - not isinstance(self.dtype, ImageDType) and self.device.split(":")[0] in view_supported_devices) @property def lbs(self): return [self] @property diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py index e3d87f51eb..4acc8192af 100644 --- a/tinygrad/tensor.py +++ b/tinygrad/tensor.py @@ -413,7 +413,7 @@ class Tensor(SimpleMathTrait): lbs = [cast(UOp, t.lazydata) for t in self.split(sizes, axis)] sharded_lbs = [lb.copy_to_device(d) for lb,d in zip(lbs, devices)] # NOTE: this contiguous is making it impossible for the scheduler to do late const folding - mlb = MultiLazyBuffer([lb.contiguous(allow_buffer_view=False) for lb in sharded_lbs], axis) + mlb = MultiLazyBuffer([lb.contiguous() for lb in sharded_lbs], axis) return Tensor(mlb, device=devices, requires_grad=self.requires_grad) def shard_(self, devices:tuple[str, ...], axis:Optional[int]=None):