From 70db8c3003e179f82ff208fd32aac8401e4e974e Mon Sep 17 00:00:00 2001 From: nimlgen <138685161+nimlgen@users.noreply.github.com> Date: Tue, 25 Feb 2025 17:22:24 +0300 Subject: [PATCH] hcq: dyn alloc signals (#9238) * hcq: dyn alloc signals * types and uniqueue devs * typing * mypy * mypy one more time * test * make fds to not intersect in mockgpu between drivers --- test/mockgpu/nv/nvdriver.py | 2 +- test/test_hcq.py | 12 +++++++++++ tinygrad/runtime/ops_amd.py | 18 ++++++---------- tinygrad/runtime/ops_nv.py | 18 ++++++---------- tinygrad/runtime/ops_qcom.py | 15 ++++++------- tinygrad/runtime/support/hcq.py | 37 +++++++++++++++++++++++++-------- 6 files changed, 59 insertions(+), 43 deletions(-) diff --git a/test/mockgpu/nv/nvdriver.py b/test/mockgpu/nv/nvdriver.py index ed3e221258..d7c330c600 100644 --- a/test/mockgpu/nv/nvdriver.py +++ b/test/mockgpu/nv/nvdriver.py @@ -59,7 +59,7 @@ class NVDriver(VirtDriver): self.root_handle = None self.gpus = {} - self.next_fd = (1 << 30) + self.next_fd = (1 << 29) self.next_handle = 1 self.object_by_handle = {} diff --git a/test/test_hcq.py b/test/test_hcq.py index 13ef71bd87..42a432000c 100644 --- a/test/test_hcq.py +++ b/test/test_hcq.py @@ -499,5 +499,17 @@ class TestHCQ(unittest.TestCase): assert "0xDEADBEE1" in str(ctx.exception) os.environ.pop("MOCKGPU_EMU_FAULTADDR") + def test_multidevice(self): + try: amd_dev = Device["AMD"] + except Exception: self.skipTest("no AMD device, test skipped") + + try: nv_dev = Device["NV"] + except Exception: self.skipTest("no NV device, test skipped") + + x = amd_dev.signal_t() + y = nv_dev.signal_t() + assert type(x) is amd_dev.signal_t + assert type(y) is nv_dev.signal_t + if __name__ == "__main__": unittest.main() diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index ba21fa7603..2818e4206c 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -1,5 +1,5 @@ from __future__ import annotations -from typing import Any, cast +from typing import Any, cast, ClassVar import os, ctypes, ctypes.util, functools, mmap, errno, array, contextlib, sys, select assert sys.platform != 'win32' from dataclasses import dataclass @@ -27,10 +27,7 @@ def nbioreg(reg): return reg + 0x00000d20 # NBIO_BASE__INST0_SEG2 class AMDSignal(HCQSignal): def __init__(self, base_addr:int|None=None, **kwargs): - super().__init__(AMDDevice.signals_pool.pop() if base_addr is None else base_addr, **kwargs, timestamp_divider=100) - - def __del__(self): - if isinstance(self.base_addr, int): AMDDevice.signals_pool.append(self.base_addr) + super().__init__(base_addr, **kwargs, timestamp_divider=100, dev_t=AMDDevice) def _sleep(self, time_spent_waiting_ms:int): # Resonable to sleep for long workloads (which take more than 2s) and only timeline signals. @@ -562,9 +559,11 @@ class PCIIface: def device_fini(self): self.adev.fini() class AMDDevice(HCQCompiled): + devices: ClassVar[list[HCQCompiled]] = [] + signal_pages: ClassVar[list[Any]] = [] + signal_pool: ClassVar[list[int]] = [] + driverless:bool = not HWInterface.exists('/sys/module/amdgpu') or bool(getenv("AMD_DRIVERLESS", 0)) - signals_page:Any = None - signals_pool:list[int] = [] def __init__(self, device:str=""): self.device_id = int(device.split(":")[1]) if ":" in device else 0 @@ -573,11 +572,6 @@ class AMDDevice(HCQCompiled): self.arch = "gfx%d%x%x" % (self.target // 10000, (self.target // 100) % 100, self.target % 100) if self.target < 100300 or self.target >= 120000: raise RuntimeError(f"Unsupported arch: {self.arch}") - if AMDDevice.signals_page is None: - AMDDevice.signals_page = self.dev_iface.alloc(16 * 65536, host=True, uncached=True, cpu_access=True) - AMDDevice.signals_pool = [AMDDevice.signals_page.va_addr + off for off in range(0, AMDDevice.signals_page.size, 16)] - else: self.dev_iface.map(AMDDevice.signals_page) - self.max_cu_id = self.dev_iface.props['simd_count'] // self.dev_iface.props['simd_per_cu'] - 1 self.max_wave_id = self.dev_iface.props['max_waves_per_simd'] * self.dev_iface.props['simd_per_cu'] - 1 self.has_scratch_base_registers = self.target >= 110000 diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py index d42b6b3ab3..9eed9a1c04 100644 --- a/tinygrad/runtime/ops_nv.py +++ b/tinygrad/runtime/ops_nv.py @@ -1,7 +1,7 @@ from __future__ import annotations import os, ctypes, contextlib, re, functools, mmap, struct, array, sys assert sys.platform != 'win32' -from typing import Any, cast, Union, Type +from typing import Any, cast, Union, Type, ClassVar from dataclasses import dataclass from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator from tinygrad.runtime.support.hcq import HWInterface, MOCKGPU @@ -73,10 +73,7 @@ assert ctypes.sizeof(qmd_struct_t) == 0x40 * 4 class NVSignal(HCQSignal): def __init__(self, base_addr:int|None=None, **kwargs): - super().__init__(NVDevice.signals_pool.pop() if base_addr is None else base_addr, **kwargs, timestamp_divider=1000, value_off=0, timestamp_off=8) - - def __del__(self): - if isinstance(self.base_addr, int): NVDevice.signals_pool.append(self.base_addr) + super().__init__(base_addr, **kwargs, timestamp_divider=1000, dev_t=NVDevice) class NVCommandQueue(HWQueue[NVSignal, 'NVDevice', 'NVProgram', 'NVArgsState']): def __init__(self): @@ -285,12 +282,14 @@ class GPFifo: MAP_FIXED, MAP_NORESERVE = 0x10, 0x400 class NVDevice(HCQCompiled[NVSignal]): + devices: ClassVar[list[HCQCompiled]] = [] + signal_pages: ClassVar[list[Any]] = [] + signal_pool: ClassVar[list[int]] = [] + root = None fd_ctl: HWInterface fd_uvm: HWInterface gpus_info: Union[list, ctypes.Array] = [] - signals_page: Any = None - signals_pool: list[int] = [] # TODO: Need a proper allocator for va addresses # 0x1000000000 - 0x2000000000, reserved for system/cpu mappings @@ -433,11 +432,6 @@ class NVDevice(HCQCompiled[NVSignal]): try: uvm.enable_peer_access(self.fd_uvm, gpuUuidA=self.gpu_uuid, gpuUuidB=dev.gpu_uuid) except RuntimeError as e: raise RuntimeError(str(e) + f". Make sure GPUs #{self.gpu_minor} & #{dev.gpu_minor} have P2P enabled between.") from e - if NVDevice.signals_page is None: - NVDevice.signals_page = self._gpu_alloc(16 * 65536, cpu_access=True, uncached=True) - NVDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, NVDevice.signals_page.size, 16)] - else: self._gpu_map(NVDevice.signals_page) - channel_params = nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS(engineType=nv_gpu.NV2080_ENGINE_TYPE_GRAPHICS) channel_group = rm_alloc(self.fd_ctl, nv_gpu.KEPLER_CHANNEL_GROUP_A, self.root, self.nvdevice, channel_params).hObjectNew diff --git a/tinygrad/runtime/ops_qcom.py b/tinygrad/runtime/ops_qcom.py index e0ce3bdb7b..90982eb71c 100644 --- a/tinygrad/runtime/ops_qcom.py +++ b/tinygrad/runtime/ops_qcom.py @@ -2,7 +2,7 @@ from __future__ import annotations import os, ctypes, functools, mmap, struct, array, math, sys assert sys.platform != 'win32' from types import SimpleNamespace -from typing import Any, cast +from typing import Any, cast, ClassVar from tinygrad.device import BufferSpec from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState, BumpAllocator from tinygrad.runtime.support.hcq import HWInterface @@ -38,10 +38,7 @@ class QCOMCompiler(CLCompiler): class QCOMSignal(HCQSignal): def __init__(self, base_addr:int|None=None, **kwargs): - super().__init__(QCOMDevice.signals_pool.pop() if base_addr is None else base_addr, **kwargs, timestamp_divider=19.2) - - def __del__(self): - if isinstance(self.base_addr, int): QCOMDevice.signals_pool.append(self.base_addr) + super().__init__(base_addr, **kwargs, timestamp_divider=19.2, dev_t=QCOMDevice) def _sleep(self, time_spent_waiting_ms:int): # Sleep only for only timeline signals. Do it immediately to free cpu. @@ -320,16 +317,16 @@ class QCOMAllocator(HCQAllocatorBase): self.dev._gpu_free(opaque) class QCOMDevice(HCQCompiled): - signals_page: Any = None - signals_pool: list[int] = [] + devices: ClassVar[list[HCQCompiled]] = [] + signal_pages: ClassVar[list[Any]] = [] + signal_pool: ClassVar[list[int]] = [] + gpu_id: int = 0 dummy_addr: int = 0 def __init__(self, device:str=""): self.fd = HWInterface('/dev/kgsl-3d0', os.O_RDWR) QCOMDevice.dummy_addr = cast(int, self._gpu_alloc(0x1000).va_addr) - QCOMDevice.signals_page = self._gpu_alloc(16 * 65536, uncached=True) - QCOMDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, self.signals_page.size, 16)] flags = kgsl.KGSL_CONTEXT_PREAMBLE | kgsl.KGSL_CONTEXT_PWR_CONSTRAINT | kgsl.KGSL_CONTEXT_NO_FAULT_TOLERANCE | kgsl.KGSL_CONTEXT_NO_GMEM_ALLOC \ | kgsl.KGSL_CONTEXT_PRIORITY(8) | kgsl.KGSL_CONTEXT_PREEMPT_STYLE(kgsl.KGSL_CONTEXT_PREEMPT_STYLE_FINEGRAIN) diff --git a/tinygrad/runtime/support/hcq.py b/tinygrad/runtime/support/hcq.py index 9c41f6fdc7..9b2134e3d6 100644 --- a/tinygrad/runtime/support/hcq.py +++ b/tinygrad/runtime/support/hcq.py @@ -1,5 +1,5 @@ from __future__ import annotations -from typing import cast, Type, TypeVar, Generic, Any +from typing import cast, Type, TypeVar, Generic, Any, ClassVar import contextlib, decimal, statistics, time, ctypes, array, os, fcntl from tinygrad.helpers import PROFILE, from_mv, getenv, to_mv, round_up from tinygrad.renderer import Renderer @@ -203,15 +203,20 @@ class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]): def _submit(self, dev:DeviceType): raise NotImplementedError("need _submit") class HCQSignal(Generic[DeviceType]): - def __init__(self, base_addr:sint=0, value:int=0, timeline_for_device:DeviceType|None=None, timestamp_divider=1, value_off=0, timestamp_off=8): - self.base_addr, self.value_addr, self.timestamp_addr = base_addr, base_addr+value_off, base_addr+timestamp_off + def __init__(self, base_addr:sint|None=None, value:int=0, dev_t:Type[DeviceType]|None=None, timeline_for_device:DeviceType|None=None, + timestamp_divider=1, value_off=0, timestamp_off=8): + self.base_addr = dev_t._alloc_signal_addr() if dev_t is not None and base_addr is None else base_addr + self.value_addr, self.timestamp_addr, self.dev_t = self.base_addr+value_off, self.base_addr+timestamp_off, dev_t self.timestamp_divider:decimal.Decimal = decimal.Decimal(timestamp_divider) self.timeline_for_device:DeviceType|None = timeline_for_device - if isinstance(base_addr, int): + if isinstance(self.base_addr, int): self.value_mv, self.timestamp_mv = to_mv(self.value_addr, 8).cast('Q'), to_mv(self.timestamp_addr, 8).cast('Q') self.value_mv[0] = value + def __del__(self): + if isinstance(self.base_addr, int) and self.dev_t is not None: self.dev_t.signal_pool.append(self.base_addr) + @property def value(self) -> int: return self.value_mv[0] @@ -332,23 +337,29 @@ class HCQCompiled(Compiled, Generic[SignalType]): """ A base class for devices compatible with the HCQ (Hardware Command Queue) API. """ - devices: list[HCQCompiled] = [] + devices: ClassVar[list[HCQCompiled]] = [] + signal_pages: ClassVar[list[Any]] = [] + signal_pool: ClassVar[list[int]] = [] def __init__(self, device:str, allocator:HCQAllocatorBase, renderer:Renderer, compiler:Compiler, runtime, signal_t:Type[SignalType], comp_queue_t:Type[HWQueue], copy_queue_t:Type[HWQueue]|None): self.device_id:int = int(device.split(":")[1]) if ":" in device else 0 + + from tinygrad.runtime.graph.hcq import HCQGraph + super().__init__(device, allocator, renderer, compiler, runtime, HCQGraph) + + # Map signals if any + for sig_page in self.signal_pages: cast(HCQAllocator, self.allocator).map(sig_page) + self.devices.append(self) + self.signal_t, self.hw_compute_queue_t, self.hw_copy_queue_t = signal_t, comp_queue_t, copy_queue_t self.timeline_value:int = 1 self.timeline_signal:SignalType = self.signal_t(value=0, timeline_for_device=self) self._shadow_timeline_signal:SignalType = self.signal_t(value=0, timeline_for_device=self) self.sig_prof_records:list[tuple[HCQSignal, HCQSignal, str, bool]] = [] - from tinygrad.runtime.graph.hcq import HCQGraph - super().__init__(device, allocator, renderer, compiler, runtime, HCQGraph) - self.kernargs_page:HCQBuffer = self.allocator.alloc(16 << 20, BufferSpec(cpu_access=True)) self.kernargs_allocator:BumpAllocator = BumpAllocator(self.kernargs_page.size, base=cast(int, self.kernargs_page.va_addr), wrap=True) - self.devices.append(self) def synchronize(self): try: self.timeline_signal.wait(self.timeline_value - 1) @@ -361,6 +372,14 @@ class HCQCompiled(Compiled, Generic[SignalType]): Compiled.profile_events += [ProfileRangeEvent(self.device, name, st.timestamp, en.timestamp, cp) for st,en,name,cp in self.sig_prof_records] self.sig_prof_records = [] + @classmethod + def _alloc_signal_addr(cls) -> int: + if not cls.signal_pool: + cls.signal_pages.append(alc:=cls.devices[0].allocator.alloc(0x1000, BufferSpec(host=True, uncached=True, cpu_access=True))) + cls.signal_pool += [alc.va_addr + off for off in range(0, alc.size, 16)] + for dev in cls.devices: cast(HCQAllocator, dev.allocator).map(alc) + return cls.signal_pool.pop() + def _at_profile_finalize(self): def _sync(d:HCQCompiled, q_t:Type[HWQueue]): q_t().timestamp(d.timeline_signal).signal(d.timeline_signal, d.timeline_value).submit(d)