hcq: dyn alloc signals (#9238)

* hcq: dyn alloc signals

* types and uniqueue devs

* typing

* mypy

* mypy one more time

* test

* make fds to not intersect in mockgpu between drivers
This commit is contained in:
nimlgen
2025-02-25 17:22:24 +03:00
committed by GitHub
parent 6610ad58ab
commit 70db8c3003
6 changed files with 59 additions and 43 deletions

View File

@@ -59,7 +59,7 @@ class NVDriver(VirtDriver):
self.root_handle = None
self.gpus = {}
self.next_fd = (1 << 30)
self.next_fd = (1 << 29)
self.next_handle = 1
self.object_by_handle = {}

View File

@@ -499,5 +499,17 @@ class TestHCQ(unittest.TestCase):
assert "0xDEADBEE1" in str(ctx.exception)
os.environ.pop("MOCKGPU_EMU_FAULTADDR")
def test_multidevice(self):
  """Check that each backend's ``signal_t`` builds instances of exactly that backend's signal class.

  The test is skipped when either the AMD or the NV device cannot be opened.
  """
  try:
    amd_dev = Device["AMD"]
  except Exception:
    self.skipTest("no AMD device, test skipped")
  try:
    nv_dev = Device["NV"]
  except Exception:
    self.skipTest("no NV device, test skipped")

  # Create one signal per backend (AMD first, then NV) before checking the types.
  amd_signal = amd_dev.signal_t()
  nv_signal = nv_dev.signal_t()
  assert type(amd_signal) is amd_dev.signal_t
  assert type(nv_signal) is nv_dev.signal_t
if __name__ == "__main__":
unittest.main()

View File

@@ -1,5 +1,5 @@
from __future__ import annotations
from typing import Any, cast
from typing import Any, cast, ClassVar
import os, ctypes, ctypes.util, functools, mmap, errno, array, contextlib, sys, select
assert sys.platform != 'win32'
from dataclasses import dataclass
@@ -27,10 +27,7 @@ def nbioreg(reg): return reg + 0x00000d20 # NBIO_BASE__INST0_SEG2
class AMDSignal(HCQSignal):
def __init__(self, base_addr:int|None=None, **kwargs):
super().__init__(AMDDevice.signals_pool.pop() if base_addr is None else base_addr, **kwargs, timestamp_divider=100)
def __del__(self):
if isinstance(self.base_addr, int): AMDDevice.signals_pool.append(self.base_addr)
super().__init__(base_addr, **kwargs, timestamp_divider=100, dev_t=AMDDevice)
def _sleep(self, time_spent_waiting_ms:int):
# Reasonable to sleep for long workloads (which take more than 2s) and only for timeline signals.
@@ -562,9 +559,11 @@ class PCIIface:
def device_fini(self): self.adev.fini()
class AMDDevice(HCQCompiled):
devices: ClassVar[list[HCQCompiled]] = []
signal_pages: ClassVar[list[Any]] = []
signal_pool: ClassVar[list[int]] = []
driverless:bool = not HWInterface.exists('/sys/module/amdgpu') or bool(getenv("AMD_DRIVERLESS", 0))
signals_page:Any = None
signals_pool:list[int] = []
def __init__(self, device:str=""):
self.device_id = int(device.split(":")[1]) if ":" in device else 0
@@ -573,11 +572,6 @@ class AMDDevice(HCQCompiled):
self.arch = "gfx%d%x%x" % (self.target // 10000, (self.target // 100) % 100, self.target % 100)
if self.target < 100300 or self.target >= 120000: raise RuntimeError(f"Unsupported arch: {self.arch}")
if AMDDevice.signals_page is None:
AMDDevice.signals_page = self.dev_iface.alloc(16 * 65536, host=True, uncached=True, cpu_access=True)
AMDDevice.signals_pool = [AMDDevice.signals_page.va_addr + off for off in range(0, AMDDevice.signals_page.size, 16)]
else: self.dev_iface.map(AMDDevice.signals_page)
self.max_cu_id = self.dev_iface.props['simd_count'] // self.dev_iface.props['simd_per_cu'] - 1
self.max_wave_id = self.dev_iface.props['max_waves_per_simd'] * self.dev_iface.props['simd_per_cu'] - 1
self.has_scratch_base_registers = self.target >= 110000

View File

@@ -1,7 +1,7 @@
from __future__ import annotations
import os, ctypes, contextlib, re, functools, mmap, struct, array, sys
assert sys.platform != 'win32'
from typing import Any, cast, Union, Type
from typing import Any, cast, Union, Type, ClassVar
from dataclasses import dataclass
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator
from tinygrad.runtime.support.hcq import HWInterface, MOCKGPU
@@ -73,10 +73,7 @@ assert ctypes.sizeof(qmd_struct_t) == 0x40 * 4
class NVSignal(HCQSignal):
def __init__(self, base_addr:int|None=None, **kwargs):
super().__init__(NVDevice.signals_pool.pop() if base_addr is None else base_addr, **kwargs, timestamp_divider=1000, value_off=0, timestamp_off=8)
def __del__(self):
if isinstance(self.base_addr, int): NVDevice.signals_pool.append(self.base_addr)
super().__init__(base_addr, **kwargs, timestamp_divider=1000, dev_t=NVDevice)
class NVCommandQueue(HWQueue[NVSignal, 'NVDevice', 'NVProgram', 'NVArgsState']):
def __init__(self):
@@ -285,12 +282,14 @@ class GPFifo:
MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
class NVDevice(HCQCompiled[NVSignal]):
devices: ClassVar[list[HCQCompiled]] = []
signal_pages: ClassVar[list[Any]] = []
signal_pool: ClassVar[list[int]] = []
root = None
fd_ctl: HWInterface
fd_uvm: HWInterface
gpus_info: Union[list, ctypes.Array] = []
signals_page: Any = None
signals_pool: list[int] = []
# TODO: Need a proper allocator for va addresses
# 0x1000000000 - 0x2000000000, reserved for system/cpu mappings
@@ -433,11 +432,6 @@ class NVDevice(HCQCompiled[NVSignal]):
try: uvm.enable_peer_access(self.fd_uvm, gpuUuidA=self.gpu_uuid, gpuUuidB=dev.gpu_uuid)
except RuntimeError as e: raise RuntimeError(str(e) + f". Make sure GPUs #{self.gpu_minor} & #{dev.gpu_minor} have P2P enabled between.") from e
if NVDevice.signals_page is None:
NVDevice.signals_page = self._gpu_alloc(16 * 65536, cpu_access=True, uncached=True)
NVDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, NVDevice.signals_page.size, 16)]
else: self._gpu_map(NVDevice.signals_page)
channel_params = nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS(engineType=nv_gpu.NV2080_ENGINE_TYPE_GRAPHICS)
channel_group = rm_alloc(self.fd_ctl, nv_gpu.KEPLER_CHANNEL_GROUP_A, self.root, self.nvdevice, channel_params).hObjectNew

View File

@@ -2,7 +2,7 @@ from __future__ import annotations
import os, ctypes, functools, mmap, struct, array, math, sys
assert sys.platform != 'win32'
from types import SimpleNamespace
from typing import Any, cast
from typing import Any, cast, ClassVar
from tinygrad.device import BufferSpec
from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState, BumpAllocator
from tinygrad.runtime.support.hcq import HWInterface
@@ -38,10 +38,7 @@ class QCOMCompiler(CLCompiler):
class QCOMSignal(HCQSignal):
def __init__(self, base_addr:int|None=None, **kwargs):
super().__init__(QCOMDevice.signals_pool.pop() if base_addr is None else base_addr, **kwargs, timestamp_divider=19.2)
def __del__(self):
if isinstance(self.base_addr, int): QCOMDevice.signals_pool.append(self.base_addr)
super().__init__(base_addr, **kwargs, timestamp_divider=19.2, dev_t=QCOMDevice)
def _sleep(self, time_spent_waiting_ms:int):
# Sleep only for timeline signals. Do it immediately to free the CPU.
@@ -320,16 +317,16 @@ class QCOMAllocator(HCQAllocatorBase):
self.dev._gpu_free(opaque)
class QCOMDevice(HCQCompiled):
signals_page: Any = None
signals_pool: list[int] = []
devices: ClassVar[list[HCQCompiled]] = []
signal_pages: ClassVar[list[Any]] = []
signal_pool: ClassVar[list[int]] = []
gpu_id: int = 0
dummy_addr: int = 0
def __init__(self, device:str=""):
self.fd = HWInterface('/dev/kgsl-3d0', os.O_RDWR)
QCOMDevice.dummy_addr = cast(int, self._gpu_alloc(0x1000).va_addr)
QCOMDevice.signals_page = self._gpu_alloc(16 * 65536, uncached=True)
QCOMDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, self.signals_page.size, 16)]
flags = kgsl.KGSL_CONTEXT_PREAMBLE | kgsl.KGSL_CONTEXT_PWR_CONSTRAINT | kgsl.KGSL_CONTEXT_NO_FAULT_TOLERANCE | kgsl.KGSL_CONTEXT_NO_GMEM_ALLOC \
| kgsl.KGSL_CONTEXT_PRIORITY(8) | kgsl.KGSL_CONTEXT_PREEMPT_STYLE(kgsl.KGSL_CONTEXT_PREEMPT_STYLE_FINEGRAIN)

View File

@@ -1,5 +1,5 @@
from __future__ import annotations
from typing import cast, Type, TypeVar, Generic, Any
from typing import cast, Type, TypeVar, Generic, Any, ClassVar
import contextlib, decimal, statistics, time, ctypes, array, os, fcntl
from tinygrad.helpers import PROFILE, from_mv, getenv, to_mv, round_up
from tinygrad.renderer import Renderer
@@ -203,15 +203,20 @@ class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]):
def _submit(self, dev:DeviceType): raise NotImplementedError("need _submit")
class HCQSignal(Generic[DeviceType]):
def __init__(self, base_addr:sint=0, value:int=0, timeline_for_device:DeviceType|None=None, timestamp_divider=1, value_off=0, timestamp_off=8):
self.base_addr, self.value_addr, self.timestamp_addr = base_addr, base_addr+value_off, base_addr+timestamp_off
def __init__(self, base_addr:sint|None=None, value:int=0, dev_t:Type[DeviceType]|None=None, timeline_for_device:DeviceType|None=None,
timestamp_divider=1, value_off=0, timestamp_off=8):
self.base_addr = dev_t._alloc_signal_addr() if dev_t is not None and base_addr is None else base_addr
self.value_addr, self.timestamp_addr, self.dev_t = self.base_addr+value_off, self.base_addr+timestamp_off, dev_t
self.timestamp_divider:decimal.Decimal = decimal.Decimal(timestamp_divider)
self.timeline_for_device:DeviceType|None = timeline_for_device
if isinstance(base_addr, int):
if isinstance(self.base_addr, int):
self.value_mv, self.timestamp_mv = to_mv(self.value_addr, 8).cast('Q'), to_mv(self.timestamp_addr, 8).cast('Q')
self.value_mv[0] = value
def __del__(self):
  """Hand this signal's address back to its device type's ``signal_pool``.

  Nothing is returned to the pool unless ``base_addr`` is a plain ``int`` and the
  signal has a ``dev_t``.
  """
  # __del__ is also invoked for objects whose __init__ raised part-way through, in which case
  # base_addr and/or dev_t may never have been set; read them defensively instead of raising here.
  base_addr, dev_t = getattr(self, "base_addr", None), getattr(self, "dev_t", None)
  if isinstance(base_addr, int) and dev_t is not None: dev_t.signal_pool.append(base_addr)
@property
def value(self) -> int: return self.value_mv[0]
@@ -332,23 +337,29 @@ class HCQCompiled(Compiled, Generic[SignalType]):
"""
A base class for devices compatible with the HCQ (Hardware Command Queue) API.
"""
devices: list[HCQCompiled] = []
devices: ClassVar[list[HCQCompiled]] = []
signal_pages: ClassVar[list[Any]] = []
signal_pool: ClassVar[list[int]] = []
def __init__(self, device:str, allocator:HCQAllocatorBase, renderer:Renderer, compiler:Compiler, runtime, signal_t:Type[SignalType],
comp_queue_t:Type[HWQueue], copy_queue_t:Type[HWQueue]|None):
self.device_id:int = int(device.split(":")[1]) if ":" in device else 0
from tinygrad.runtime.graph.hcq import HCQGraph
super().__init__(device, allocator, renderer, compiler, runtime, HCQGraph)
# Map signals if any
for sig_page in self.signal_pages: cast(HCQAllocator, self.allocator).map(sig_page)
self.devices.append(self)
self.signal_t, self.hw_compute_queue_t, self.hw_copy_queue_t = signal_t, comp_queue_t, copy_queue_t
self.timeline_value:int = 1
self.timeline_signal:SignalType = self.signal_t(value=0, timeline_for_device=self)
self._shadow_timeline_signal:SignalType = self.signal_t(value=0, timeline_for_device=self)
self.sig_prof_records:list[tuple[HCQSignal, HCQSignal, str, bool]] = []
from tinygrad.runtime.graph.hcq import HCQGraph
super().__init__(device, allocator, renderer, compiler, runtime, HCQGraph)
self.kernargs_page:HCQBuffer = self.allocator.alloc(16 << 20, BufferSpec(cpu_access=True))
self.kernargs_allocator:BumpAllocator = BumpAllocator(self.kernargs_page.size, base=cast(int, self.kernargs_page.va_addr), wrap=True)
self.devices.append(self)
def synchronize(self):
try: self.timeline_signal.wait(self.timeline_value - 1)
@@ -361,6 +372,14 @@ class HCQCompiled(Compiled, Generic[SignalType]):
Compiled.profile_events += [ProfileRangeEvent(self.device, name, st.timestamp, en.timestamp, cp) for st,en,name,cp in self.sig_prof_records]
self.sig_prof_records = []
@classmethod
def _alloc_signal_addr(cls) -> int:
  """Pop one signal address from the class-level pool, allocating a new page when the pool is empty.

  A new page is allocated through the first registered device's allocator, recorded in
  ``signal_pages``, split into 16-byte slots that are added to ``signal_pool``, and mapped
  through the allocator of every device in ``devices``.
  """
  # Fast path: a free slot is already available.
  if cls.signal_pool: return cls.signal_pool.pop()

  page_spec = BufferSpec(host=True, uncached=True, cpu_access=True)
  page = cls.devices[0].allocator.alloc(0x1000, page_spec)
  cls.signal_pages.append(page)

  # One slot every 16 bytes across the whole page.
  cls.signal_pool += [page.va_addr + slot_off for slot_off in range(0, page.size, 16)]

  for dev in cls.devices:
    cast(HCQAllocator, dev.allocator).map(page)
  return cls.signal_pool.pop()
def _at_profile_finalize(self):
def _sync(d:HCQCompiled, q_t:Type[HWQueue]):
q_t().timestamp(d.timeline_signal).signal(d.timeline_signal, d.timeline_value).submit(d)