fix out of resource kernels on nv (#4450)

* fix out of resource kernels on nv * better comment * noqa * noqa 2 * linter
2026-06-13 00:15:35 +08:00 · 2024-05-06 19:24:20 +03:00
parent f4e49a7c1a
commit d0b8862dea
2 changed files with 34 additions and 1 deletions
--- a/test/external/external_test_nv.py
+++ b/test/external/external_test_nv.py
@@ -0,0 +1,30 @@
+import unittest
+from tinygrad import Device, dtypes, Tensor
+from tinygrad.engine.schedule import create_schedule
+from tinygrad.runtime.ops_nv import NVDevice
+from tinygrad.features.search import Opt, OptOps
+from test.test_linearizer_failures import helper_test_lin
+
+from tinygrad.codegen.linearizer import Linearizer
+from tinygrad.ops import LazyOp, BinaryOps, UnaryOps, ReduceOps, BufferOps, MemBuffer
+from tinygrad.shape.shapetracker import ShapeTracker
+from tinygrad.shape.view import View
+
+class TestNV(unittest.TestCase):
+  @classmethod
+  def setUpClass(self):
+    TestNV.d0: NVDevice = Device["NV"]
+    TestNV.a = Tensor([0.,1.], device="NV").realize()
+    TestNV.b = self.a + 1
+    si = create_schedule([self.b.lazydata])[-1]
+    TestNV.d0_runner = TestNV.d0.get_runner(*si.ast)
+    TestNV.b.lazydata.buffer.allocate()
+
+  def test_oor_kernels(self):
+    ast = LazyOp(op=BufferOps.STORE, src=(LazyOp(op=UnaryOps.CAST, src=(LazyOp(op=ReduceOps.SUM, src=(LazyOp(op=UnaryOps.CAST, src=(LazyOp(op=BinaryOps.MUL, src=(LazyOp(op=BufferOps.LOAD, src=(), arg=MemBuffer(idx=1, dtype=dtypes.half, st=ShapeTracker(views=(View(shape=(1, 256, 1, 512, 4, 16, 4, 16), strides=(0, 100352, 0, 196, 0, 14, 0, 1), offset=-15, mask=((0, 1), (0, 256), (0, 1), (0, 512), (0, 4), (1, 15), (0, 4), (1, 15)), contiguous=False), View(shape=(256, 1, 512, 7, 7, 512, 3, 3), strides=(2097152, 0, 0, 128, 2, 4096, 1088, 17), offset=0, mask=None, contiguous=False))))), LazyOp(op=BufferOps.LOAD, src=(), arg=MemBuffer(idx=2, dtype=dtypes.half, st=ShapeTracker(views=(View(shape=(256, 1, 512, 7, 7, 512, 3, 3), strides=(25088, 0, 49, 7, 1, 0, 0, 0), offset=0, mask=None, contiguous=False),))))), arg=None),), arg=(dtypes.float, False)),), arg=((0, 3, 4), dtypes.float)),), arg=(dtypes.half, False)),), arg=MemBuffer(idx=0, dtype=dtypes.half, st=ShapeTracker(views=(View(shape=(1, 1, 512, 1, 1, 512, 3, 3), strides=(0, 0, 4608, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=True),)))) # noqa: E501
+    opts = [Opt(op=OptOps.TC, axis=6, amt=2), Opt(op=OptOps.UPCAST, axis=0, amt=4), Opt(op=OptOps.UPCAST, axis=3, amt=0), Opt(op=OptOps.LOCAL, axis=1, amt=4), Opt(op=OptOps.LOCAL, axis=2, amt=3), Opt(op=OptOps.UPCAST, axis=1, amt=2)] # noqa: E501
+    helper_test_lin(Linearizer(ast), opts=opts, failed_platforms=["NV"])
+
+if __name__ == "__main__":
+  unittest.main()
+
--- a/tinygrad/runtime/ops_nv.py
+++ b/tinygrad/runtime/ops_nv.py
@@ -188,6 +188,9 @@ class NVProgram:
          if section_data[i * 3 + 0] & 0xffff == 0x1204 and section_data[i * 3 + 2] + 0x240 > self.device.slm_per_thread:
            raise RuntimeError("too high local memory")

+    # Register allocation granularity per warp is 256, warp allocation granularity is 4. Register file size is 65536.
+    self.max_threads = ((65536 // round_up(self.registers_usage * 32, 256)) // 4) * 4 * 32
+
    # Load program and constant buffers (if any)
    self.lib_sz = round_up(round_up(self.program.nbytes, 128) + sum([round_up(x.nbytes, 128) for i,x in constant_buffers_data.items()]), 0x1000)
    self.lib_gpu = self.device.allocator.alloc(self.lib_sz)
@@ -226,7 +229,7 @@ class NVProgram:
    if hasattr(self, 'lib_gpu'): self.device.allocator.free(self.lib_gpu, self.lib_sz)

  def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
-    if prod(local_size) > 1024 or self.registers_usage * prod(local_size) > 65536: raise RuntimeError("Too many resources requsted for launch")
+    if prod(local_size) > 1024 or self.max_threads < prod(local_size): raise RuntimeError("Too many resources requsted for launch")

    kernargs_size = round_up(QMD_SIZE + 0x160 + len(args) * 8 + len(vals) * 4, 1 << 8)
    if self.device.kernargs_ptr >= (self.device.kernargs_page.base + self.device.kernargs_page.length - kernargs_size):