diff --git a/test/external/external_test_hcq.py b/test/external/external_test_hcq.py index 6d3c7045ae..8cd7a74463 100644 --- a/test/external/external_test_hcq.py +++ b/test/external/external_test_hcq.py @@ -22,7 +22,7 @@ class TestHCQ(unittest.TestCase): TestHCQ.a = Tensor([0.,1.], device=Device.DEFAULT).realize() TestHCQ.b = self.a + 1 si = create_schedule([self.b.lazydata])[-1] - TestHCQ.runner = get_runner(TestHCQ.d0.dname, si.ast) + TestHCQ.runner = get_runner(TestHCQ.d0.device, si.ast) TestHCQ.b.lazydata.buffer.allocate() # wow that's a lot of abstraction layers TestHCQ.addr = struct.pack("QQ", TestHCQ.b.lazydata.buffer._buf.va_addr, TestHCQ.a.lazydata.buffer._buf.va_addr) diff --git a/test/external/external_test_nv.py b/test/external/external_test_nv.py index 58e556b44e..87d3d682fe 100644 --- a/test/external/external_test_nv.py +++ b/test/external/external_test_nv.py @@ -21,7 +21,7 @@ class TestNV(unittest.TestCase): TestNV.a = Tensor([0.,1.], device="NV").realize() TestNV.b = self.a + 1 si = create_schedule([self.b.lazydata])[-1] - TestNV.d0_runner = get_runner(TestNV.d0.dname, si.ast) + TestNV.d0_runner = get_runner(TestNV.d0.device, si.ast) TestNV.b.lazydata.buffer.allocate() TestNV.addr = struct.pack("QQ", TestNV.b.lazydata.buffer._buf.va_addr, TestNV.a.lazydata.buffer._buf.va_addr) @@ -44,7 +44,7 @@ class TestNV(unittest.TestCase): def test_buf4_usage(self): TestNV.along = Tensor([105615], device="NV").realize() ast = LazyOp(op=BufferOps.STORE, src=(LazyOp(op=Ops.SIN, src=(LazyOp(op=Ops.CAST, src=(LazyOp(op=BufferOps.LOAD, src=(), arg=MemBuffer(idx=1, dtype=dtypes.ulong, st=ShapeTracker(views=(View(shape=(3,), strides=(1,), offset=0, mask=None, contiguous=True),)))),), arg=dtypes.float),), arg=None),), arg=MemBuffer(idx=0, dtype=dtypes.float, st=ShapeTracker(views=(View(shape=(3,), strides=(1,), offset=0, mask=None, contiguous=True),)))) # noqa: E501 - temp_runner = get_runner(TestNV.d0.dname, (ast,)) + temp_runner = get_runner(TestNV.d0.device, (ast,)) temp_runner([TestNV.b.lazydata.buffer, TestNV.along.lazydata.buffer], var_vals={}) val = TestNV.b.lazydata.buffer.as_buffer().cast("f")[0] assert abs(val - 0.80647) < 0.001, f"got val {val}" diff --git a/test/external/fuzz_linearizer.py b/test/external/fuzz_linearizer.py index 49199c11c5..6245f3b5d6 100644 --- a/test/external/fuzz_linearizer.py +++ b/test/external/fuzz_linearizer.py @@ -228,7 +228,7 @@ def fuzz_linearizer(lin: Kernel, rtol=1e-2, atol=1e-2, opts_list=None): validate_lin = test_lin.copy() validate_lin.opts = validate_device.renderer if validate_rawbufs is None: - validate_rawbufs = [get_fuzz_rawbuf_like(x, copy=True, force_device=validate_device.dname) for x in rawbufs] + validate_rawbufs = [get_fuzz_rawbuf_like(x, copy=True, force_device=validate_device.device) for x in rawbufs] (_msg, _, _, _, state2) = compare_linearizer(validate_lin, validate_rawbufs, var_vals, ground_truth, rtol=rtol, atol=atol) if _msg != "PASS": failures[f"VALIDATE_DEV_{_msg}"].append((validate_lin.ast, validate_lin.applied_opts)) diff --git a/test/test_hcq.py b/test/test_hcq.py index d6d3b97802..5c04fb2b26 100644 --- a/test/test_hcq.py +++ b/test/test_hcq.py @@ -18,7 +18,7 @@ class TestHCQ(unittest.TestCase): TestHCQ.b = self.a + 1 si = create_schedule([self.b.lazydata])[-1] - TestHCQ.runner = get_runner(TestHCQ.d0.dname, si.ast) + TestHCQ.runner = get_runner(TestHCQ.d0.device, si.ast) TestHCQ.b.lazydata.buffer.allocate() TestHCQ.kernargs_ba_ptr = TestHCQ.runner.clprg.fill_kernargs([TestHCQ.b.lazydata.buffer._buf, TestHCQ.a.lazydata.buffer._buf]) @@ -426,7 +426,7 @@ class TestHCQ(unittest.TestCase): def test_memory_barrier(self): a = Tensor([0, 1], device=Device.DEFAULT, dtype=dtypes.int8).realize() b = a + 1 - runner = get_runner(TestHCQ.d0.dname, create_schedule([b.lazydata])[-1].ast) + runner = get_runner(TestHCQ.d0.device, create_schedule([b.lazydata])[-1].ast) buf1 = Buffer(Device.DEFAULT, 2, dtypes.int8, options=BufferOptions(nolru=True)).ensure_allocated() buf2 = Buffer(Device.DEFAULT, 2, dtypes.int8, options=BufferOptions(cpu_access=True, nolru=True)).ensure_allocated() diff --git a/test/test_profiler.py b/test/test_profiler.py index bd2deffa66..dff7676507 100644 --- a/test/test_profiler.py +++ b/test/test_profiler.py @@ -79,7 +79,7 @@ class TestProfiler(unittest.TestCase): TestProfiler.b = self.a + 1 si = create_schedule([self.b.lazydata])[-1] - TestProfiler.runner = get_runner(TestProfiler.d0.dname, si.ast) + TestProfiler.runner = get_runner(TestProfiler.d0.device, si.ast) TestProfiler.b.lazydata.buffer.allocate() TestProfiler.kernargs_ba_ptr = TestProfiler.runner.clprg.fill_kernargs([TestProfiler.b.lazydata.buffer._buf, TestProfiler.a.lazydata.buffer._buf]) @@ -155,9 +155,9 @@ class TestProfiler(unittest.TestCase): def f(a): x = (a + 1).realize() - return x, x.to(d1.dname).realize() + return x, x.to(d1.device).realize() - a = Tensor.randn(10, 10, device=TestProfiler.d0.dname).realize() + a = Tensor.randn(10, 10, device=TestProfiler.d0.device).realize() with helper_collect_profile(TestProfiler.d0, d1) as profile: jf = TinyJit(f) for _ in range(3): jf(a) @@ -176,9 +176,9 @@ class TestProfiler(unittest.TestCase): def f(a): x = (a + 1).realize() - return x, x.to(d1.dname).realize() + return x, x.to(d1.device).realize() - a = Tensor.randn(10, 10, device=TestProfiler.d0.dname).realize() + a = Tensor.randn(10, 10, device=TestProfiler.d0.device).realize() with helper_collect_profile(TestProfiler.d0, d1) as profile: jf = TinyJit(f) for _ in range(3): diff --git a/tinygrad/device.py b/tinygrad/device.py index 97d70fd0d6..fdaf63a8fb 100644 --- a/tinygrad/device.py +++ b/tinygrad/device.py @@ -30,7 +30,7 @@ class _Device: def default(self) -> Compiled: return self[self.DEFAULT] def get_available_devices(self) -> Iterator[str]: for device in ["METAL", "AMD", "NV", "CUDA", "QCOM", "GPU", "CLANG", "LLVM"]: - with contextlib.suppress(Exception): yield self[device].dname + with contextlib.suppress(Exception): yield self[device].device @functools.cached_property def DEFAULT(self) -> str: if (from_env:=next((d for d in self._devices if d not in ["DISK", "NPY"] and getenv(d) == 1), None)): return from_env @@ -194,7 +194,7 @@ class Compiler: class Compiled: def __init__(self, device:str, allocator:Allocator, renderer:Optional[Renderer], compiler:Optional[Compiler], runtime, graph=None): - self.dname, self.allocator, self.compiler, self.runtime, self.graph = device, allocator, compiler or Compiler(), runtime, graph + self.device, self.allocator, self.compiler, self.runtime, self.graph = device, allocator, compiler or Compiler(), runtime, graph self.renderer = renderer or Renderer() def synchronize(self): """ diff --git a/tinygrad/runtime/graph/hcq.py b/tinygrad/runtime/graph/hcq.py index 7f2e5e602a..7b29eed0ca 100644 --- a/tinygrad/runtime/graph/hcq.py +++ b/tinygrad/runtime/graph/hcq.py @@ -76,7 +76,7 @@ class HCQGraph(MultiGraphRunner): # Remove self-dependency for compute and copy queues. # For compute, in case of NV, optimize when only 1 same-queue dependency exists, since NV chains 2+ executions in this case, # eliminating dependency need. - dname = enqueue_dev.dname.split(":", 1)[0] + dname = enqueue_dev.device.split(":", 1)[0] can_opt = dname in {"AMD", "QCOM"} or (dname == "NV" and len(sync_signals) == 0 and len(opt_deps) == 1 and id(opt_deps[0][0]) == id(out_signal)) if can_opt or isinstance(ji.prg, BufferXfer): opt_deps = [x for x in opt_deps if id(x[0]) != id(out_signal)] diff --git a/tinygrad/runtime/graph/metal.py b/tinygrad/runtime/graph/metal.py index ce167f2ca6..42df75ae5f 100644 --- a/tinygrad/runtime/graph/metal.py +++ b/tinygrad/runtime/graph/metal.py @@ -28,7 +28,7 @@ class MetalGraph(GraphRunner): msg(icb_descriptor, "setInheritPipelineState:", False) msg(icb_descriptor, "setMaxKernelBufferBindCount:", 31) - self.icb = msg(self.dev.device, "newIndirectCommandBufferWithDescriptor:maxCommandCount:options:", + self.icb = msg(self.dev.sysdevice, "newIndirectCommandBufferWithDescriptor:maxCommandCount:options:", icb_descriptor, len(self.jit_cache), MTLResourceOptions.MTLResourceCPUCacheModeDefaultCache, restype=objc_instance) if self.icb.value is None: raise GraphException("create indirect command buffer failed, does your system support this?") icb_label = bytes(msg(msg(self.icb, "description", restype=objc_instance), "UTF8String", restype=ctypes.c_char_p)).decode() diff --git a/tinygrad/runtime/ops_disk.py b/tinygrad/runtime/ops_disk.py index 916bbf8b5f..486880e5dc 100644 --- a/tinygrad/runtime/ops_disk.py +++ b/tinygrad/runtime/ops_disk.py @@ -81,7 +81,7 @@ class DiskDevice(Compiled): self.count += 1 assert self.size is None or size <= self.size, f"can't reopen Disk tensor with larger size, opened with {self.size}, tried to open with {size}" if self.size is not None: return - filename = self.dname[len("disk:"):] + filename = self.device[len("disk:"):] self.size = size if sys.platform != "win32" and filename.startswith("shm:"): diff --git a/tinygrad/runtime/ops_metal.py b/tinygrad/runtime/ops_metal.py index e31c815c28..a15ff8ed59 100644 --- a/tinygrad/runtime/ops_metal.py +++ b/tinygrad/runtime/ops_metal.py @@ -59,7 +59,7 @@ def error_check(error: objc_instance, error_constructor: type[Exception] = Runti def metal_src_to_library(device:MetalDevice, src:str) -> objc_instance: options = msg(libobjc.objc_getClass(b"MTLCompileOptions"), "new", restype=objc_instance) msg(options, "setFastMathEnabled:", getenv("METAL_FAST_MATH")) - library = msg(device.device, "newLibraryWithSource:options:error:", to_ns_str(src), options, + library = msg(device.sysdevice, "newLibraryWithSource:options:error:", to_ns_str(src), options, ctypes.byref(compileError:=objc_instance()), restype=objc_instance) error_check(compileError, CompileError) return library @@ -93,7 +93,7 @@ class MetalProgram: # binary metal library data = libdispatch.dispatch_data_create(lib, len(lib), None, None) error_library_creation = objc_instance() - self.library = msg(self.dev.device, "newLibraryWithData:error:", data, ctypes.byref(error_library_creation), restype=objc_instance) + self.library = msg(self.dev.sysdevice, "newLibraryWithData:error:", data, ctypes.byref(error_library_creation), restype=objc_instance) error_check(error_library_creation) else: # metal source. rely on OS caching @@ -103,7 +103,7 @@ class MetalProgram: descriptor = msg(libobjc.objc_getClass(b"MTLComputePipelineDescriptor"), "new", restype=objc_instance) msg(descriptor, "setComputeFunction:", self.fxn) msg(descriptor, "setSupportIndirectCommandBuffers:", True) - self.pipeline_state = msg(self.dev.device, "newComputePipelineStateWithDescriptor:options:reflection:error:", + self.pipeline_state = msg(self.dev.sysdevice, "newComputePipelineStateWithDescriptor:options:reflection:error:", descriptor, MTLPipelineOption.MTLPipelineOptionNone, None, ctypes.byref(error_pipeline_creation:=objc_instance()), restype=objc_instance) error_check(error_pipeline_creation) @@ -130,12 +130,12 @@ class MetalBuffer: def __init__(self, buf:Any, size:int, offset=0): self.buf, self.size, self.offset = buf, size, offset class MetalAllocator(LRUAllocator): - def __init__(self, device:MetalDevice): - self.dev:MetalDevice = device + def __init__(self, dev:MetalDevice): + self.dev:MetalDevice = dev super().__init__() def _alloc(self, size:int, options) -> MetalBuffer: # Buffer is explicitly released in _free() rather than garbage collected via reference count - ret = msg(self.dev.device, "newBufferWithLength:options:", size, MTLResourceOptions.MTLResourceStorageModeShared, restype=objc_id) + ret = msg(self.dev.sysdevice, "newBufferWithLength:options:", size, MTLResourceOptions.MTLResourceStorageModeShared, restype=objc_id) if ret.value is None: raise MemoryError(f"Metal OOM while allocating {size=}") return MetalBuffer(ret, size) def _free(self, opaque:MetalBuffer, options): msg(opaque.buf, "release") @@ -166,12 +166,12 @@ class MetalAllocator(LRUAllocator): class MetalDevice(Compiled): def __init__(self, device:str): - self.device = libmetal.MTLCreateSystemDefaultDevice() - self.mtl_queue = msg(self.device, "newCommandQueueWithMaxCommandBufferCount:", 1024, restype=objc_instance) + self.sysdevice = libmetal.MTLCreateSystemDefaultDevice() + self.mtl_queue = msg(self.sysdevice, "newCommandQueueWithMaxCommandBufferCount:", 1024, restype=objc_instance) if self.mtl_queue is None: raise RuntimeError("Cannot allocate a new command queue") self.mtl_buffers_in_flight: List[Any] = [] self.mv_in_metal: List[memoryview] = [] - self.timeline_signal = msg(self.device, "newSharedEvent", restype=objc_instance) + self.timeline_signal = msg(self.sysdevice, "newSharedEvent", restype=objc_instance) self.timeline_value = 0 from tinygrad.runtime.graph.metal import MetalGraph diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py index de84f0f5c1..fade7cba32 100644 --- a/tinygrad/runtime/ops_nv.py +++ b/tinygrad/runtime/ops_nv.py @@ -329,7 +329,7 @@ class NVDevice(HCQCompiled): def _gpu_map_to_cpu(self, memory_handle, size, target=None, flags=0, system=False): fd_dev = self._new_gpu_fd() if not system else os.open("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC) made = nv_gpu.nv_ioctl_nvos33_parameters_with_fd(fd=fd_dev, - params=nv_gpu.NVOS33_PARAMETERS(hClient=self.root, hDevice=self.device, hMemory=memory_handle, length=size, flags=flags)) + params=nv_gpu.NVOS33_PARAMETERS(hClient=self.root, hDevice=self.nvdevice, hMemory=memory_handle, length=size, flags=flags)) nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_MAP_MEMORY, made) if made.params.status != 0: raise RuntimeError(f"_gpu_map_to_cpu returned {get_error_str(made.params.status)}") res = libc.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0), fd_dev, 0) @@ -345,7 +345,7 @@ class NVDevice(HCQCompiled): ((nv_gpu.NVOS32_ATTR2_PAGE_SIZE_HUGE_2MB << 20) if huge_page else 0)), flags=(nv_gpu.NVOS32_ALLOC_FLAGS_ALIGNMENT_FORCE | nv_gpu.NVOS32_ALLOC_FLAGS_PERSISTENT_VIDMEM | nv_gpu.NVOS32_ALLOC_FLAGS_MAP_NOT_REQUIRED | nv_gpu.NVOS32_ALLOC_FLAGS_IGNORE_BANK_PLACEMENT | nv_gpu.NVOS32_ALLOC_FLAGS_MEMORY_HANDLE_PROVIDED)) - mem_handle = rm_alloc(self.fd_ctl, nv_gpu.NV1_MEMORY_USER, self.root, self.device, alloc_params).hObjectNew + mem_handle = rm_alloc(self.fd_ctl, nv_gpu.NV1_MEMORY_USER, self.root, self.nvdevice, alloc_params).hObjectNew if va_addr is None: va_addr = self._alloc_gpu_vaddr(size, alignment=align, force_low=map_to_cpu) if map_to_cpu: va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags) @@ -357,7 +357,7 @@ class NVDevice(HCQCompiled): attr2=(nv_gpu.NVOS32_ATTR2_ZBC_PREFER_NO_ZBC << 0) | (nv_gpu.NVOS32_ATTR2_GPU_CACHEABLE_NO << 2), flags=(nv_gpu.NVOS32_ALLOC_FLAGS_IGNORE_BANK_PLACEMENT | nv_gpu.NVOS32_ALLOC_FLAGS_MEMORY_HANDLE_PROVIDED | nv_gpu.NVOS32_ALLOC_FLAGS_MAP_NOT_REQUIRED), format=6, size=size, alignment=(4<<10), offset=0, limit=size-1) - mem_handle = rm_alloc(self.fd_ctl, nv_gpu.NV1_MEMORY_SYSTEM, self.root, self.device, alloc_params).hObjectNew + mem_handle = rm_alloc(self.fd_ctl, nv_gpu.NV1_MEMORY_SYSTEM, self.root, self.nvdevice, alloc_params).hObjectNew if va_addr is None: va_addr = self._alloc_gpu_vaddr(size, force_low=True) if map_to_cpu: va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags, system=True) @@ -372,7 +372,7 @@ class NVDevice(HCQCompiled): NVDevice.host_object_enumerator += 1 flags = ((nv_gpu.NVOS02_FLAGS_PHYSICALITY_NONCONTIGUOUS << 4) | (nv_gpu.NVOS02_FLAGS_COHERENCY_CACHED << 12) | (nv_gpu.NVOS02_FLAGS_MAPPING_NO_MAP << 30)) - made = nv_gpu.nv_ioctl_nvos02_parameters_with_fd(params=nv_gpu.NVOS02_PARAMETERS(hRoot=self.root, hObjectParent=self.device, flags=flags, + made = nv_gpu.nv_ioctl_nvos02_parameters_with_fd(params=nv_gpu.NVOS02_PARAMETERS(hRoot=self.root, hObjectParent=self.nvdevice, flags=flags, hObjectNew=NVDevice.host_object_enumerator, hClass=nv_gpu.NV01_MEMORY_SYSTEM_OS_DESCRIPTOR, pMemory=va_base, limit=aligned_sz-1), fd=-1) nv_iowr(self.fd_dev, nv_gpu.NV_ESC_RM_ALLOC_MEMORY, made) @@ -381,7 +381,8 @@ class NVDevice(HCQCompiled): def _gpu_free(self, mem): if mem.hMemory > NVDevice.host_object_enumerator: # not a host object, clear phys mem. - nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, made:=nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.device, hObjectOld=mem.hMemory)) + made = nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.nvdevice, hObjectOld=mem.hMemory) + nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, made) if made.status != 0: raise RuntimeError(f"_gpu_free returned {get_error_str(made.status)}") self._debug_mappings.pop((mem.va_addr, mem.size)) @@ -411,7 +412,7 @@ class NVDevice(HCQCompiled): def _setup_nvclasses(self): classlist = memoryview(bytearray(100 * 4)).cast('I') - clsinfo = rmctrl.gpu_get_classlist(self.fd_ctl, self.root, self.device, numClasses=100, classList=mv_address(classlist)) + clsinfo = rmctrl.gpu_get_classlist(self.fd_ctl, self.root, self.nvdevice, numClasses=100, classList=mv_address(classlist)) self.nvclasses = {classlist[i] for i in range(clsinfo.numClasses)} self.compute_class = next(clss for clss in [nv_gpu.ADA_COMPUTE_A, nv_gpu.AMPERE_COMPUTE_B] if clss in self.nvclasses) @@ -439,8 +440,8 @@ class NVDevice(HCQCompiled): device_params = nv_gpu.NV0080_ALLOC_PARAMETERS(deviceId=self.gpu_info.deviceInstance, hClientShare=self.root, vaMode=nv_gpu.NV_DEVICE_ALLOCATION_VAMODE_MULTIPLE_VASPACES) - self.device = rm_alloc(self.fd_ctl, nv_gpu.NV01_DEVICE_0, self.root, self.root, device_params).hObjectNew - self.subdevice = rm_alloc(self.fd_ctl, nv_gpu.NV20_SUBDEVICE_0, self.root, self.device, None).hObjectNew + self.nvdevice = rm_alloc(self.fd_ctl, nv_gpu.NV01_DEVICE_0, self.root, self.root, device_params).hObjectNew + self.subdevice = rm_alloc(self.fd_ctl, nv_gpu.NV20_SUBDEVICE_0, self.root, self.nvdevice, None).hObjectNew self.usermode = rm_alloc(self.fd_ctl, nv_gpu.TURING_USERMODE_A, self.root, self.subdevice, None).hObjectNew self.gpu_mmio = to_mv(self._gpu_map_to_cpu(self.usermode, mmio_sz:=0x10000, flags=2), mmio_sz).cast("I") @@ -452,7 +453,7 @@ class NVDevice(HCQCompiled): vaspace_params = nv_gpu.NV_VASPACE_ALLOCATION_PARAMETERS(vaBase=0x1000, vaSize=0x1fffffb000000, flags=nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_ENABLE_PAGE_FAULTING | nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_IS_EXTERNALLY_OWNED) - vaspace = rm_alloc(self.fd_ctl, nv_gpu.FERMI_VASPACE_A, self.root, self.device, vaspace_params).hObjectNew + vaspace = rm_alloc(self.fd_ctl, nv_gpu.FERMI_VASPACE_A, self.root, self.nvdevice, vaspace_params).hObjectNew raw_uuid = rmctrl.gpu_get_gid_info(self.fd_ctl, self.root, self.subdevice, flags=nv_gpu.NV2080_GPU_CMD_GPU_GET_GID_FLAGS_FORMAT_BINARY, length=16) self.gpu_uuid = nv_gpu.struct_nv_uuid(uuid=(ctypes.c_ubyte*16)(*[raw_uuid.data[i] for i in range(16)])) @@ -470,7 +471,7 @@ class NVDevice(HCQCompiled): else: self._gpu_map(NVDevice.signals_page) channel_params = nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS(engineType=nv_gpu.NV2080_ENGINE_TYPE_GRAPHICS) - channel_group = rm_alloc(self.fd_ctl, nv_gpu.KEPLER_CHANNEL_GROUP_A, self.root, self.device, channel_params).hObjectNew + channel_group = rm_alloc(self.fd_ctl, nv_gpu.KEPLER_CHANNEL_GROUP_A, self.root, self.nvdevice, channel_params).hObjectNew gpfifo_area = self._gpu_alloc(0x200000, contig=True, huge_page=True, map_to_cpu=True, map_flags=0x10d0000, tag="gpfifo") @@ -508,7 +509,7 @@ class NVDevice(HCQCompiled): if enable_debug: self.debug_compute_obj, self.debug_channel = comp, gpfifo debugger_params = nv_gpu.NV83DE_ALLOC_PARAMETERS(hAppClient=self.root, hClass3dObject=self.debug_compute_obj) - self.debugger = rm_alloc(self.fd_ctl, nv_gpu.GT200_DEBUGGER, self.root, self.device, debugger_params).hObjectNew + self.debugger = rm_alloc(self.fd_ctl, nv_gpu.GT200_DEBUGGER, self.root, self.nvdevice, debugger_params).hObjectNew ws_token_params = rmctrl.gpfifo_get_work_submit_token(self.fd_ctl, self.root, gpfifo, workSubmitToken=-1) assert ws_token_params.workSubmitToken != -1 diff --git a/tinygrad/runtime/support/hcq.py b/tinygrad/runtime/support/hcq.py index 2d8aa154be..a671a34106 100644 --- a/tinygrad/runtime/support/hcq.py +++ b/tinygrad/runtime/support/hcq.py @@ -451,11 +451,11 @@ class HCQCompiled(Compiled): self.synchronize() for st, en, name, is_cp, args in self.raw_prof_records: - self.profile_logger.events += [(name, self._gpu2cpu_time(st, is_cp), self._gpu2cpu_time(en, is_cp), self.dname, qname[is_cp], args)] + self.profile_logger.events += [(name, self._gpu2cpu_time(st, is_cp), self._gpu2cpu_time(en, is_cp), self.device, qname[is_cp], args)] for a_st, a_en, a_dev, a_is_copy, b_st, b_en, b_dev, b_is_copy in self.dep_prof_records: # Perfetto connects nodes based on timing data, ensuring every choice is valid by averaging times to a midpoint. a_tm, b_tm = a_dev._gpu2cpu_time((a_st+a_en)/decimal.Decimal(2), a_is_copy), b_dev._gpu2cpu_time((b_st+b_en)/decimal.Decimal(2), b_is_copy) - self.profile_logger.deps += [(a_tm, b_tm, a_dev.dname, qname[a_is_copy], b_dev.dname, qname[b_is_copy])] + self.profile_logger.deps += [(a_tm, b_tm, a_dev.device, qname[a_is_copy], b_dev.device, qname[b_is_copy])] self.raw_prof_records, self.dep_prof_records = [], [] # Remove the logger, this flushes all data written by the device. @@ -486,7 +486,7 @@ class HCQAllocator(LRUAllocator, Generic[DeviceType]): # pylint: disable=abstrac def _copyin(self, dest:HCQBuffer, src:memoryview): assert self.dev.hw_copy_queue_t is not None - with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"CPU -> {self.dev.dname}", enabled=PROFILE): + with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"CPU -> {self.dev.device}", enabled=PROFILE): for i in range(0, src.nbytes, self.b[0].size): self.b_next = (self.b_next + 1) % len(self.b) self.dev.timeline_signal.wait(self.b_timeline[self.b_next]) @@ -506,7 +506,7 @@ class HCQAllocator(LRUAllocator, Generic[DeviceType]): # pylint: disable=abstrac return None assert self.dev.hw_copy_queue_t is not None - with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"DISK -> {self.dev.dname}", enabled=PROFILE): + with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"DISK -> {self.dev.device}", enabled=PROFILE): for (batch_info, dst_off, src_off, copy_size) in src.device.allocator._copyout_sharded(src, size, _get_temp_buf, seg_len=self.b[0].size): self.dev.hw_copy_queue_t().wait(self.dev.timeline_signal, self.dev.timeline_value - 1) \ .copy(dest.va_addr + dst_off, batch_info[0] + src_off, copy_size) \ @@ -518,7 +518,7 @@ class HCQAllocator(LRUAllocator, Generic[DeviceType]): # pylint: disable=abstrac self.dev.synchronize() assert self.dev.hw_copy_queue_t is not None - with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"{self.dev.dname} -> CPU", enabled=PROFILE): + with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"{self.dev.device} -> CPU", enabled=PROFILE): for i in range(0, dest.nbytes, self.b[0].size): self.dev.hw_copy_queue_t().wait(self.dev.timeline_signal, self.dev.timeline_value - 1) \ .copy(self.b[0].va_addr, src.va_addr+i, lsize:=min(self.b[0].size, dest.nbytes-i)) \ @@ -532,7 +532,7 @@ class HCQAllocator(LRUAllocator, Generic[DeviceType]): # pylint: disable=abstrac cast(HCQAllocator, src_dev.allocator).map(dest) assert src_dev.hw_copy_queue_t is not None - with hcq_profile(src_dev, queue_type=src_dev.hw_copy_queue_t, desc=f"{src_dev.dname} -> {dest_dev.dname}", enabled=PROFILE): + with hcq_profile(src_dev, queue_type=src_dev.hw_copy_queue_t, desc=f"{src_dev.device} -> {dest_dev.device}", enabled=PROFILE): src_dev.hw_copy_queue_t().wait(src_dev.timeline_signal, src_dev.timeline_value - 1) \ .wait(dest_dev.timeline_signal, dest_dev.timeline_value - 1) \ .copy(dest.va_addr, src.va_addr, sz) \