From a1f70ce7d0cd29deb6bbb73a2586042642e38f03 Mon Sep 17 00:00:00 2001
From: qazal <77887910+Qazalin@users.noreply.github.com>
Date: Wed, 15 Jan 2025 12:34:15 -0500
Subject: [PATCH] only use BUFFER_VIEW in disk [pr] (#8629)

* only use BUFFER_VIEW in disk [pr]

* delete can_view

* BUFFER_VIEW op on DISK

* remove that allow_buffer_view=False

* notes

* bitcast is a low-level op too

* this passes on AMD and LLVM
---
 test/test_schedule.py   |  5 +++--
 test/test_setitem.py    | 10 ++--------
 test/test_tensor_uop.py |  3 +--
 tinygrad/ops.py         | 13 +++++++------
 tinygrad/tensor.py      |  2 +-
 5 files changed, 14 insertions(+), 19 deletions(-)

diff --git a/test/test_schedule.py b/test/test_schedule.py
index 2d531f0170..a0cd1b3272 100644
--- a/test/test_schedule.py
+++ b/test/test_schedule.py
@@ -13,7 +13,7 @@ from tinygrad.device import is_dtype_supported
 from tinygrad.dtype import DType, ImageDType
 from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.view import View
-from tinygrad.ops import PatternMatcher, UOp, Ops, UPat, graph_rewrite, track_rewrites, view_supported_devices, symbolic_simple, merge_views
+from tinygrad.ops import PatternMatcher, UOp, Ops, UPat, graph_rewrite, track_rewrites, symbolic_simple, merge_views
 from tinygrad.helpers import CI, DEBUG, FUSE_ARANGE, GlobalCounters, getenv, SPLIT_REDUCEOP, unwrap, prod, Context
 from tinygrad.codegen.kernel import verify_ast
 from tinygrad.engine.schedule import ScheduleItem, create_schedule_with_vars, view_right, view_left, remove_movement_ops
@@ -1630,7 +1630,8 @@ class TestIndexing(unittest.TestCase):
     a[0] = 6
     np.testing.assert_equal(a.numpy(), [6., 2., 3., 4.])
 
-  @unittest.skipUnless(Device.DEFAULT in view_supported_devices, "need view")
+  # NOTE: this test was previously gated on Device.DEFAULT being in view_supported_devices
+  @unittest.skip("BUFFER_VIEW no longer supported on non-disk devices")
   def test_arange_view_op(self):
     a = Tensor.arange(12).reshape(4, 3).shrink(((1, 2), (1, 3))).contiguous()
     sched = self.check_schedule(a, 1)
diff --git a/test/test_setitem.py b/test/test_setitem.py
index 84ccb8bc1f..f1bb595ef2 100644
--- a/test/test_setitem.py
+++ b/test/test_setitem.py
@@ -1,6 +1,5 @@
 import unittest
-from tinygrad import Device, Tensor, TinyJit, Variable, dtypes
-from tinygrad.helpers import CI
+from tinygrad import Tensor, TinyJit, Variable, dtypes
 import numpy as np
 
 class TestSetitem(unittest.TestCase):
@@ -139,12 +138,7 @@ class TestSetitem(unittest.TestCase):
   def test_setitem_overlapping_inplace1(self):
     t = Tensor([[3.0], [2.0], [1.0]]).contiguous()
     t[1:] = t[:-1]
-    if (Device.DEFAULT == "LLVM") or (CI and Device.DEFAULT == "AMD"):
-      # TODO: FIXME
-      with self.assertRaises(AssertionError):
-        self.assertEqual(t.tolist(), [[3.0], [3.0], [2.0]])
-    else:
-      self.assertEqual(t.tolist(), [[3.0], [3.0], [2.0]])
+    self.assertEqual(t.tolist(), [[3.0], [3.0], [2.0]])
 
   def test_setitem_overlapping_inplace2(self):
     t = Tensor([[3.0], [2.0], [1.0]]).contiguous()
diff --git a/test/test_tensor_uop.py b/test/test_tensor_uop.py
index 03e1b71649..b2d4acad97 100644
--- a/test/test_tensor_uop.py
+++ b/test/test_tensor_uop.py
@@ -3,7 +3,7 @@ import numpy as np
 import unittest
 from tinygrad import Tensor, Device, dtypes
 from tinygrad.engine.realize import run_schedule
-from tinygrad.ops import Ops, UOp, UPat, view_supported_devices
+from tinygrad.ops import Ops, UOp, UPat
 
 class TestTensorUOp(unittest.TestCase):
   def test_fromcpu_shape_tracker(self):
@@ -84,7 +84,6 @@ class TestTensorUOp(unittest.TestCase):
     sched = empty.schedule()
     self.assertEqual(len(sched), 0)
 
-  @unittest.skipIf(Device.DEFAULT in view_supported_devices, "BUFFER_VIEW cannot exist on a CONST")
   def test_contiguous_folded_alu(self):
     a = Tensor.empty(8, 8)
     # NOTE: the buffer for mul_0 late folds to just a CONST
diff --git a/tinygrad/ops.py b/tinygrad/ops.py
index 2c23541a64..fabd94bdfb 100644
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -367,7 +367,7 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
       raise RuntimeError(f"unsupported size in bitcast {dtype}")
     # shape changing bitcast can use a subbuffer on DISK
     # TODO: this should be moved to realize.py
-    if self.can_view() and self.device.startswith("DISK"): return UOp(Ops.BUFFER_VIEW, dtype, (self,))
+    if self._device is not None and self.device.startswith("DISK"): return UOp(Ops.BUFFER_VIEW, dtype, (self,))
     return UOp(Ops.BITCAST, dtype, (self,))
   def gep(self, i:Union[tuple[int, ...], int]):
     if isinstance(i, int):
@@ -420,9 +420,13 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
     if DEBUG >= 3: print(f"split {divisor}: {self.shape} -> {splitted.shape} -> {new_shape}")
     return splitted._reduce_op(op, axis)._reduce_op(op, (len(new_shape),)).reshape(new_shape)  # reduce original axes, then split
   def assign(self, x:UOp): return UOp(Ops.ASSIGN, self.dtype, (self,x))
-  def contiguous(self, allow_buffer_view=True):
+  def contiguous(self):
+    # TODO: BUFFER_VIEW op should be deleted and subbuffer should be moved to realize.py
+    # NOTE: DISK uses subbuffer because DISK does not render kernels
+    if self.device.startswith("DISK"): return self.alu(Ops.BUFFER_VIEW)
+    # otherwise it's normal CONTIGUOUS
     if not unwrap(self.st).contiguous or self.size != self.base.size or self.base.op is Ops.CONST:
-      return self.alu(Ops.BUFFER_VIEW if allow_buffer_view and self.can_view() else Ops.CONTIGUOUS)
+      return self.alu(Ops.CONTIGUOUS)
     forced_realize.add(self.base)
     return self
 
@@ -451,9 +455,6 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
     return UOp(Ops.COPY, self.base.dtype, (UOp(Ops.DEVICE, arg=device), self.base), clone).view(unwrap(self.st))
   def clone(self) -> UOp: return self.copy_to_device(self.device, clone=True)
   def is_unrealized_unmasked_const(self): return self.base.op is Ops.CONST and all(v.mask is None for v in unwrap(self.st).views)
-  def can_view(self):
-    return (self.st is not None and self._device is not None and self.st.consecutive and self.base.op is not Ops.CONST and
-            not isinstance(self.dtype, ImageDType) and self.device.split(":")[0] in view_supported_devices)
   @property
   def lbs(self): return [self]
   @property
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index e3d87f51eb..4acc8192af 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -413,7 +413,7 @@ class Tensor(SimpleMathTrait):
       lbs = [cast(UOp, t.lazydata) for t in self.split(sizes, axis)]
     sharded_lbs = [lb.copy_to_device(d) for lb,d in zip(lbs, devices)]
     # NOTE: this contiguous is making it impossible for the scheduler to do late const folding
-    mlb = MultiLazyBuffer([lb.contiguous(allow_buffer_view=False) for lb in sharded_lbs], axis)
+    mlb = MultiLazyBuffer([lb.contiguous() for lb in sharded_lbs], axis)
     return Tensor(mlb, device=devices, requires_grad=self.requires_grad)
 
   def shard_(self, devices:tuple[str, ...], axis:Optional[int]=None):