mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-13 00:15:35 +08:00
only use BUFFER_VIEW in disk [pr] (#8629)
* only use BUFFER_VIEW in disk [pr]
* delete can_view
* BUFFER_VIEW op on DISK
* remove that allow_buffer_view=False
* notes
* bitcast is a low-level op too
* this passes on AMD and LLVM
This commit is contained in:
@@ -13,7 +13,7 @@ from tinygrad.device import is_dtype_supported
|
||||
from tinygrad.dtype import DType, ImageDType
|
||||
from tinygrad.shape.shapetracker import ShapeTracker
|
||||
from tinygrad.shape.view import View
|
||||
from tinygrad.ops import PatternMatcher, UOp, Ops, UPat, graph_rewrite, track_rewrites, view_supported_devices, symbolic_simple, merge_views
|
||||
from tinygrad.ops import PatternMatcher, UOp, Ops, UPat, graph_rewrite, track_rewrites, symbolic_simple, merge_views
|
||||
from tinygrad.helpers import CI, DEBUG, FUSE_ARANGE, GlobalCounters, getenv, SPLIT_REDUCEOP, unwrap, prod, Context
|
||||
from tinygrad.codegen.kernel import verify_ast
|
||||
from tinygrad.engine.schedule import ScheduleItem, create_schedule_with_vars, view_right, view_left, remove_movement_ops
|
||||
@@ -1630,7 +1630,8 @@ class TestIndexing(unittest.TestCase):
|
||||
a[0] = 6
|
||||
np.testing.assert_equal(a.numpy(), [6., 2., 3., 4.])
|
||||
|
||||
@unittest.skipUnless(Device.DEFAULT in view_supported_devices, "need view")
|
||||
#@unittest.skipUnless(Device.DEFAULT in view_supported_devices, "need view")
|
||||
@unittest.skip("BUFFER_VIEW no longer supported on non-disk devices")
|
||||
def test_arange_view_op(self):
|
||||
a = Tensor.arange(12).reshape(4, 3).shrink(((1, 2), (1, 3))).contiguous()
|
||||
sched = self.check_schedule(a, 1)
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
import unittest
|
||||
from tinygrad import Device, Tensor, TinyJit, Variable, dtypes
|
||||
from tinygrad.helpers import CI
|
||||
from tinygrad import Tensor, TinyJit, Variable, dtypes
|
||||
import numpy as np
|
||||
|
||||
class TestSetitem(unittest.TestCase):
|
||||
@@ -139,12 +138,7 @@ class TestSetitem(unittest.TestCase):
|
||||
def test_setitem_overlapping_inplace1(self):
|
||||
t = Tensor([[3.0], [2.0], [1.0]]).contiguous()
|
||||
t[1:] = t[:-1]
|
||||
if (Device.DEFAULT == "LLVM") or (CI and Device.DEFAULT == "AMD"):
|
||||
# TODO: FIXME
|
||||
with self.assertRaises(AssertionError):
|
||||
self.assertEqual(t.tolist(), [[3.0], [3.0], [2.0]])
|
||||
else:
|
||||
self.assertEqual(t.tolist(), [[3.0], [3.0], [2.0]])
|
||||
self.assertEqual(t.tolist(), [[3.0], [3.0], [2.0]])
|
||||
|
||||
def test_setitem_overlapping_inplace2(self):
|
||||
t = Tensor([[3.0], [2.0], [1.0]]).contiguous()
|
||||
|
||||
@@ -3,7 +3,7 @@ import numpy as np
|
||||
import unittest
|
||||
from tinygrad import Tensor, Device, dtypes
|
||||
from tinygrad.engine.realize import run_schedule
|
||||
from tinygrad.ops import Ops, UOp, UPat, view_supported_devices
|
||||
from tinygrad.ops import Ops, UOp, UPat
|
||||
|
||||
class TestTensorUOp(unittest.TestCase):
|
||||
def test_fromcpu_shape_tracker(self):
|
||||
@@ -84,7 +84,6 @@ class TestTensorUOp(unittest.TestCase):
|
||||
sched = empty.schedule()
|
||||
self.assertEqual(len(sched), 0)
|
||||
|
||||
@unittest.skipIf(Device.DEFAULT in view_supported_devices, "BUFFER_VIEW cannot exist on a CONST")
|
||||
def test_contiguous_folded_alu(self):
|
||||
a = Tensor.empty(8, 8)
|
||||
# NOTE: the buffer for mul_0 late folds to just a CONST
|
||||
|
||||
@@ -367,7 +367,7 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
|
||||
raise RuntimeError(f"unsupported size in bitcast {dtype}")
|
||||
# shape changing bitcast can use a subbuffer on DISK
|
||||
# TODO: this should be moved to realize.py
|
||||
if self.can_view() and self.device.startswith("DISK"): return UOp(Ops.BUFFER_VIEW, dtype, (self,))
|
||||
if self._device is not None and self.device.startswith("DISK"): return UOp(Ops.BUFFER_VIEW, dtype, (self,))
|
||||
return UOp(Ops.BITCAST, dtype, (self,))
|
||||
def gep(self, i:Union[tuple[int, ...], int]):
|
||||
if isinstance(i, int):
|
||||
@@ -420,9 +420,13 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
|
||||
if DEBUG >= 3: print(f"split {divisor}: {self.shape} -> {splitted.shape} -> {new_shape}")
|
||||
return splitted._reduce_op(op, axis)._reduce_op(op, (len(new_shape),)).reshape(new_shape) # reduce original axes, then split
|
||||
def assign(self, x:UOp): return UOp(Ops.ASSIGN, self.dtype, (self,x))
|
||||
def contiguous(self, allow_buffer_view=True):
|
||||
def contiguous(self):
|
||||
# TODO: BUFFER_VIEW op should be deleted and subbuffer should be moved to realize.py
|
||||
# NOTE: DISK uses subbuffer because DISK does not render kernels
|
||||
if self.device.startswith("DISK"): return self.alu(Ops.BUFFER_VIEW)
|
||||
# otherwise it's normal CONTIGUOUS
|
||||
if not unwrap(self.st).contiguous or self.size != self.base.size or self.base.op is Ops.CONST:
|
||||
return self.alu(Ops.BUFFER_VIEW if allow_buffer_view and self.can_view() else Ops.CONTIGUOUS)
|
||||
return self.alu(Ops.CONTIGUOUS)
|
||||
forced_realize.add(self.base)
|
||||
return self
|
||||
|
||||
@@ -451,9 +455,6 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
|
||||
return UOp(Ops.COPY, self.base.dtype, (UOp(Ops.DEVICE, arg=device), self.base), clone).view(unwrap(self.st))
|
||||
def clone(self) -> UOp: return self.copy_to_device(self.device, clone=True)
|
||||
def is_unrealized_unmasked_const(self): return self.base.op is Ops.CONST and all(v.mask is None for v in unwrap(self.st).views)
|
||||
def can_view(self):
|
||||
return (self.st is not None and self._device is not None and self.st.consecutive and self.base.op is not Ops.CONST and
|
||||
not isinstance(self.dtype, ImageDType) and self.device.split(":")[0] in view_supported_devices)
|
||||
@property
|
||||
def lbs(self): return [self]
|
||||
@property
|
||||
|
||||
@@ -413,7 +413,7 @@ class Tensor(SimpleMathTrait):
|
||||
lbs = [cast(UOp, t.lazydata) for t in self.split(sizes, axis)]
|
||||
sharded_lbs = [lb.copy_to_device(d) for lb,d in zip(lbs, devices)]
|
||||
# NOTE: this contiguous is making it impossible for the scheduler to do late const folding
|
||||
mlb = MultiLazyBuffer([lb.contiguous(allow_buffer_view=False) for lb in sharded_lbs], axis)
|
||||
mlb = MultiLazyBuffer([lb.contiguous() for lb in sharded_lbs], axis)
|
||||
return Tensor(mlb, device=devices, requires_grad=self.requires_grad)
|
||||
|
||||
def shard_(self, devices:tuple[str, ...], axis:Optional[int]=None):
|
||||
|
||||
Reference in New Issue
Block a user