only use BUFFER_VIEW on DISK [pr] (#8629)

* only use BUFFER_VIEW on DISK [pr]

* delete can_view

* BUFFER_VIEW op on DISK

* remove the allow_buffer_view=False argument

* notes

* bitcast is a low-level op too

* this passes on AMD and LLVM
This commit is contained in:
qazal
2025-01-15 12:34:15 -05:00
committed by GitHub
parent bae20e5043
commit a1f70ce7d0
5 changed files with 14 additions and 19 deletions

View File

@@ -13,7 +13,7 @@ from tinygrad.device import is_dtype_supported
from tinygrad.dtype import DType, ImageDType
from tinygrad.shape.shapetracker import ShapeTracker
from tinygrad.shape.view import View
from tinygrad.ops import PatternMatcher, UOp, Ops, UPat, graph_rewrite, track_rewrites, view_supported_devices, symbolic_simple, merge_views
from tinygrad.ops import PatternMatcher, UOp, Ops, UPat, graph_rewrite, track_rewrites, symbolic_simple, merge_views
from tinygrad.helpers import CI, DEBUG, FUSE_ARANGE, GlobalCounters, getenv, SPLIT_REDUCEOP, unwrap, prod, Context
from tinygrad.codegen.kernel import verify_ast
from tinygrad.engine.schedule import ScheduleItem, create_schedule_with_vars, view_right, view_left, remove_movement_ops
@@ -1630,7 +1630,8 @@ class TestIndexing(unittest.TestCase):
a[0] = 6
np.testing.assert_equal(a.numpy(), [6., 2., 3., 4.])
@unittest.skipUnless(Device.DEFAULT in view_supported_devices, "need view")
#@unittest.skipUnless(Device.DEFAULT in view_supported_devices, "need view")
@unittest.skip("BUFFER_VIEW no longer supported on non-disk devices")
def test_arange_view_op(self):
a = Tensor.arange(12).reshape(4, 3).shrink(((1, 2), (1, 3))).contiguous()
sched = self.check_schedule(a, 1)

View File

@@ -1,6 +1,5 @@
import unittest
from tinygrad import Device, Tensor, TinyJit, Variable, dtypes
from tinygrad.helpers import CI
from tinygrad import Tensor, TinyJit, Variable, dtypes
import numpy as np
class TestSetitem(unittest.TestCase):
@@ -139,12 +138,7 @@ class TestSetitem(unittest.TestCase):
def test_setitem_overlapping_inplace1(self):
t = Tensor([[3.0], [2.0], [1.0]]).contiguous()
t[1:] = t[:-1]
if (Device.DEFAULT == "LLVM") or (CI and Device.DEFAULT == "AMD"):
# TODO: FIXME
with self.assertRaises(AssertionError):
self.assertEqual(t.tolist(), [[3.0], [3.0], [2.0]])
else:
self.assertEqual(t.tolist(), [[3.0], [3.0], [2.0]])
self.assertEqual(t.tolist(), [[3.0], [3.0], [2.0]])
def test_setitem_overlapping_inplace2(self):
t = Tensor([[3.0], [2.0], [1.0]]).contiguous()

View File

@@ -3,7 +3,7 @@ import numpy as np
import unittest
from tinygrad import Tensor, Device, dtypes
from tinygrad.engine.realize import run_schedule
from tinygrad.ops import Ops, UOp, UPat, view_supported_devices
from tinygrad.ops import Ops, UOp, UPat
class TestTensorUOp(unittest.TestCase):
def test_fromcpu_shape_tracker(self):
@@ -84,7 +84,6 @@ class TestTensorUOp(unittest.TestCase):
sched = empty.schedule()
self.assertEqual(len(sched), 0)
@unittest.skipIf(Device.DEFAULT in view_supported_devices, "BUFFER_VIEW cannot exist on a CONST")
def test_contiguous_folded_alu(self):
a = Tensor.empty(8, 8)
# NOTE: the buffer for mul_0 late folds to just a CONST

View File

@@ -367,7 +367,7 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
raise RuntimeError(f"unsupported size in bitcast {dtype}")
# shape changing bitcast can use a subbuffer on DISK
# TODO: this should be moved to realize.py
if self.can_view() and self.device.startswith("DISK"): return UOp(Ops.BUFFER_VIEW, dtype, (self,))
if self._device is not None and self.device.startswith("DISK"): return UOp(Ops.BUFFER_VIEW, dtype, (self,))
return UOp(Ops.BITCAST, dtype, (self,))
def gep(self, i:Union[tuple[int, ...], int]):
if isinstance(i, int):
@@ -420,9 +420,13 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
if DEBUG >= 3: print(f"split {divisor}: {self.shape} -> {splitted.shape} -> {new_shape}")
return splitted._reduce_op(op, axis)._reduce_op(op, (len(new_shape),)).reshape(new_shape) # reduce original axes, then split
def assign(self, x:UOp): return UOp(Ops.ASSIGN, self.dtype, (self,x))
def contiguous(self, allow_buffer_view=True):
def contiguous(self):
# TODO: BUFFER_VIEW op should be deleted and subbuffer should be moved to realize.py
# NOTE: DISK uses subbuffer because DISK does not render kernels
if self.device.startswith("DISK"): return self.alu(Ops.BUFFER_VIEW)
# otherwise it's normal CONTIGUOUS
if not unwrap(self.st).contiguous or self.size != self.base.size or self.base.op is Ops.CONST:
return self.alu(Ops.BUFFER_VIEW if allow_buffer_view and self.can_view() else Ops.CONTIGUOUS)
return self.alu(Ops.CONTIGUOUS)
forced_realize.add(self.base)
return self
@@ -451,9 +455,6 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
return UOp(Ops.COPY, self.base.dtype, (UOp(Ops.DEVICE, arg=device), self.base), clone).view(unwrap(self.st))
def clone(self) -> UOp: return self.copy_to_device(self.device, clone=True)
def is_unrealized_unmasked_const(self): return self.base.op is Ops.CONST and all(v.mask is None for v in unwrap(self.st).views)
def can_view(self):
return (self.st is not None and self._device is not None and self.st.consecutive and self.base.op is not Ops.CONST and
not isinstance(self.dtype, ImageDType) and self.device.split(":")[0] in view_supported_devices)
@property
def lbs(self): return [self]
@property

View File

@@ -413,7 +413,7 @@ class Tensor(SimpleMathTrait):
lbs = [cast(UOp, t.lazydata) for t in self.split(sizes, axis)]
sharded_lbs = [lb.copy_to_device(d) for lb,d in zip(lbs, devices)]
# NOTE: this contiguous is making it impossible for the scheduler to do late const folding
mlb = MultiLazyBuffer([lb.contiguous(allow_buffer_view=False) for lb in sharded_lbs], axis)
mlb = MultiLazyBuffer([lb.contiguous() for lb in sharded_lbs], axis)
return Tensor(mlb, device=devices, requires_grad=self.requires_grad)
def shard_(self, devices:tuple[str, ...], axis:Optional[int]=None):