mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-13 08:28:55 +08:00
hcq: dyn alloc signals (#9238)
* hcq: dyn alloc signals * types and uniqueue devs * typing * mypy * mypy one more time * test * make fds to not intersect in mockgpu between drivers
This commit is contained in:
@@ -59,7 +59,7 @@ class NVDriver(VirtDriver):
|
||||
self.root_handle = None
|
||||
|
||||
self.gpus = {}
|
||||
self.next_fd = (1 << 30)
|
||||
self.next_fd = (1 << 29)
|
||||
self.next_handle = 1
|
||||
|
||||
self.object_by_handle = {}
|
||||
|
||||
@@ -499,5 +499,17 @@ class TestHCQ(unittest.TestCase):
|
||||
assert "0xDEADBEE1" in str(ctx.exception)
|
||||
os.environ.pop("MOCKGPU_EMU_FAULTADDR")
|
||||
|
||||
def test_multidevice(self):
  """Check that AMD and NV devices each hand out signals of their own signal type."""
  # Both backends are needed for this test; skip as soon as either one fails to open.
  try:
    amd_dev = Device["AMD"]
  except Exception:
    self.skipTest("no AMD device, test skipped")

  try:
    nv_dev = Device["NV"]
  except Exception:
    self.skipTest("no NV device, test skipped")

  # Create one signal per backend (AMD first, then NV) before checking either of them.
  amd_sig = amd_dev.signal_t()
  nv_sig = nv_dev.signal_t()

  # Exact type identity is checked (not isinstance): each signal must be precisely its device's signal_t.
  assert type(amd_sig) is amd_dev.signal_t
  assert type(nv_sig) is nv_dev.signal_t
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
from __future__ import annotations
|
||||
from typing import Any, cast
|
||||
from typing import Any, cast, ClassVar
|
||||
import os, ctypes, ctypes.util, functools, mmap, errno, array, contextlib, sys, select
|
||||
assert sys.platform != 'win32'
|
||||
from dataclasses import dataclass
|
||||
@@ -27,10 +27,7 @@ def nbioreg(reg): return reg + 0x00000d20 # NBIO_BASE__INST0_SEG2
|
||||
|
||||
class AMDSignal(HCQSignal):
|
||||
def __init__(self, base_addr:int|None=None, **kwargs):
|
||||
super().__init__(AMDDevice.signals_pool.pop() if base_addr is None else base_addr, **kwargs, timestamp_divider=100)
|
||||
|
||||
def __del__(self):
|
||||
if isinstance(self.base_addr, int): AMDDevice.signals_pool.append(self.base_addr)
|
||||
super().__init__(base_addr, **kwargs, timestamp_divider=100, dev_t=AMDDevice)
|
||||
|
||||
def _sleep(self, time_spent_waiting_ms:int):
|
||||
# Reasonable to sleep for long workloads (which take more than 2s) and only for timeline signals.
|
||||
@@ -562,9 +559,11 @@ class PCIIface:
|
||||
def device_fini(self): self.adev.fini()
|
||||
|
||||
class AMDDevice(HCQCompiled):
|
||||
devices: ClassVar[list[HCQCompiled]] = []
|
||||
signal_pages: ClassVar[list[Any]] = []
|
||||
signal_pool: ClassVar[list[int]] = []
|
||||
|
||||
driverless:bool = not HWInterface.exists('/sys/module/amdgpu') or bool(getenv("AMD_DRIVERLESS", 0))
|
||||
signals_page:Any = None
|
||||
signals_pool:list[int] = []
|
||||
|
||||
def __init__(self, device:str=""):
|
||||
self.device_id = int(device.split(":")[1]) if ":" in device else 0
|
||||
@@ -573,11 +572,6 @@ class AMDDevice(HCQCompiled):
|
||||
self.arch = "gfx%d%x%x" % (self.target // 10000, (self.target // 100) % 100, self.target % 100)
|
||||
if self.target < 100300 or self.target >= 120000: raise RuntimeError(f"Unsupported arch: {self.arch}")
|
||||
|
||||
if AMDDevice.signals_page is None:
|
||||
AMDDevice.signals_page = self.dev_iface.alloc(16 * 65536, host=True, uncached=True, cpu_access=True)
|
||||
AMDDevice.signals_pool = [AMDDevice.signals_page.va_addr + off for off in range(0, AMDDevice.signals_page.size, 16)]
|
||||
else: self.dev_iface.map(AMDDevice.signals_page)
|
||||
|
||||
self.max_cu_id = self.dev_iface.props['simd_count'] // self.dev_iface.props['simd_per_cu'] - 1
|
||||
self.max_wave_id = self.dev_iface.props['max_waves_per_simd'] * self.dev_iface.props['simd_per_cu'] - 1
|
||||
self.has_scratch_base_registers = self.target >= 110000
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
from __future__ import annotations
|
||||
import os, ctypes, contextlib, re, functools, mmap, struct, array, sys
|
||||
assert sys.platform != 'win32'
|
||||
from typing import Any, cast, Union, Type
|
||||
from typing import Any, cast, Union, Type, ClassVar
|
||||
from dataclasses import dataclass
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator
|
||||
from tinygrad.runtime.support.hcq import HWInterface, MOCKGPU
|
||||
@@ -73,10 +73,7 @@ assert ctypes.sizeof(qmd_struct_t) == 0x40 * 4
|
||||
|
||||
class NVSignal(HCQSignal):
|
||||
def __init__(self, base_addr:int|None=None, **kwargs):
|
||||
super().__init__(NVDevice.signals_pool.pop() if base_addr is None else base_addr, **kwargs, timestamp_divider=1000, value_off=0, timestamp_off=8)
|
||||
|
||||
def __del__(self):
|
||||
if isinstance(self.base_addr, int): NVDevice.signals_pool.append(self.base_addr)
|
||||
super().__init__(base_addr, **kwargs, timestamp_divider=1000, dev_t=NVDevice)
|
||||
|
||||
class NVCommandQueue(HWQueue[NVSignal, 'NVDevice', 'NVProgram', 'NVArgsState']):
|
||||
def __init__(self):
|
||||
@@ -285,12 +282,14 @@ class GPFifo:
|
||||
|
||||
MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
|
||||
class NVDevice(HCQCompiled[NVSignal]):
|
||||
devices: ClassVar[list[HCQCompiled]] = []
|
||||
signal_pages: ClassVar[list[Any]] = []
|
||||
signal_pool: ClassVar[list[int]] = []
|
||||
|
||||
root = None
|
||||
fd_ctl: HWInterface
|
||||
fd_uvm: HWInterface
|
||||
gpus_info: Union[list, ctypes.Array] = []
|
||||
signals_page: Any = None
|
||||
signals_pool: list[int] = []
|
||||
|
||||
# TODO: Need a proper allocator for va addresses
|
||||
# 0x1000000000 - 0x2000000000, reserved for system/cpu mappings
|
||||
@@ -433,11 +432,6 @@ class NVDevice(HCQCompiled[NVSignal]):
|
||||
try: uvm.enable_peer_access(self.fd_uvm, gpuUuidA=self.gpu_uuid, gpuUuidB=dev.gpu_uuid)
|
||||
except RuntimeError as e: raise RuntimeError(str(e) + f". Make sure GPUs #{self.gpu_minor} & #{dev.gpu_minor} have P2P enabled between.") from e
|
||||
|
||||
if NVDevice.signals_page is None:
|
||||
NVDevice.signals_page = self._gpu_alloc(16 * 65536, cpu_access=True, uncached=True)
|
||||
NVDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, NVDevice.signals_page.size, 16)]
|
||||
else: self._gpu_map(NVDevice.signals_page)
|
||||
|
||||
channel_params = nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS(engineType=nv_gpu.NV2080_ENGINE_TYPE_GRAPHICS)
|
||||
channel_group = rm_alloc(self.fd_ctl, nv_gpu.KEPLER_CHANNEL_GROUP_A, self.root, self.nvdevice, channel_params).hObjectNew
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@ from __future__ import annotations
|
||||
import os, ctypes, functools, mmap, struct, array, math, sys
|
||||
assert sys.platform != 'win32'
|
||||
from types import SimpleNamespace
|
||||
from typing import Any, cast
|
||||
from typing import Any, cast, ClassVar
|
||||
from tinygrad.device import BufferSpec
|
||||
from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState, BumpAllocator
|
||||
from tinygrad.runtime.support.hcq import HWInterface
|
||||
@@ -38,10 +38,7 @@ class QCOMCompiler(CLCompiler):
|
||||
|
||||
class QCOMSignal(HCQSignal):
|
||||
def __init__(self, base_addr:int|None=None, **kwargs):
|
||||
super().__init__(QCOMDevice.signals_pool.pop() if base_addr is None else base_addr, **kwargs, timestamp_divider=19.2)
|
||||
|
||||
def __del__(self):
|
||||
if isinstance(self.base_addr, int): QCOMDevice.signals_pool.append(self.base_addr)
|
||||
super().__init__(base_addr, **kwargs, timestamp_divider=19.2, dev_t=QCOMDevice)
|
||||
|
||||
def _sleep(self, time_spent_waiting_ms:int):
|
||||
# Sleep only for timeline signals. Do it immediately to free the CPU.
|
||||
@@ -320,16 +317,16 @@ class QCOMAllocator(HCQAllocatorBase):
|
||||
self.dev._gpu_free(opaque)
|
||||
|
||||
class QCOMDevice(HCQCompiled):
|
||||
signals_page: Any = None
|
||||
signals_pool: list[int] = []
|
||||
devices: ClassVar[list[HCQCompiled]] = []
|
||||
signal_pages: ClassVar[list[Any]] = []
|
||||
signal_pool: ClassVar[list[int]] = []
|
||||
|
||||
gpu_id: int = 0
|
||||
dummy_addr: int = 0
|
||||
|
||||
def __init__(self, device:str=""):
|
||||
self.fd = HWInterface('/dev/kgsl-3d0', os.O_RDWR)
|
||||
QCOMDevice.dummy_addr = cast(int, self._gpu_alloc(0x1000).va_addr)
|
||||
QCOMDevice.signals_page = self._gpu_alloc(16 * 65536, uncached=True)
|
||||
QCOMDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, self.signals_page.size, 16)]
|
||||
|
||||
flags = kgsl.KGSL_CONTEXT_PREAMBLE | kgsl.KGSL_CONTEXT_PWR_CONSTRAINT | kgsl.KGSL_CONTEXT_NO_FAULT_TOLERANCE | kgsl.KGSL_CONTEXT_NO_GMEM_ALLOC \
|
||||
| kgsl.KGSL_CONTEXT_PRIORITY(8) | kgsl.KGSL_CONTEXT_PREEMPT_STYLE(kgsl.KGSL_CONTEXT_PREEMPT_STYLE_FINEGRAIN)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
from __future__ import annotations
|
||||
from typing import cast, Type, TypeVar, Generic, Any
|
||||
from typing import cast, Type, TypeVar, Generic, Any, ClassVar
|
||||
import contextlib, decimal, statistics, time, ctypes, array, os, fcntl
|
||||
from tinygrad.helpers import PROFILE, from_mv, getenv, to_mv, round_up
|
||||
from tinygrad.renderer import Renderer
|
||||
@@ -203,15 +203,20 @@ class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]):
|
||||
def _submit(self, dev:DeviceType): raise NotImplementedError("need _submit")
|
||||
|
||||
class HCQSignal(Generic[DeviceType]):
|
||||
def __init__(self, base_addr:sint=0, value:int=0, timeline_for_device:DeviceType|None=None, timestamp_divider=1, value_off=0, timestamp_off=8):
|
||||
self.base_addr, self.value_addr, self.timestamp_addr = base_addr, base_addr+value_off, base_addr+timestamp_off
|
||||
def __init__(self, base_addr:sint|None=None, value:int=0, dev_t:Type[DeviceType]|None=None, timeline_for_device:DeviceType|None=None,
|
||||
timestamp_divider=1, value_off=0, timestamp_off=8):
|
||||
self.base_addr = dev_t._alloc_signal_addr() if dev_t is not None and base_addr is None else base_addr
|
||||
self.value_addr, self.timestamp_addr, self.dev_t = self.base_addr+value_off, self.base_addr+timestamp_off, dev_t
|
||||
self.timestamp_divider:decimal.Decimal = decimal.Decimal(timestamp_divider)
|
||||
self.timeline_for_device:DeviceType|None = timeline_for_device
|
||||
|
||||
if isinstance(base_addr, int):
|
||||
if isinstance(self.base_addr, int):
|
||||
self.value_mv, self.timestamp_mv = to_mv(self.value_addr, 8).cast('Q'), to_mv(self.timestamp_addr, 8).cast('Q')
|
||||
self.value_mv[0] = value
|
||||
|
||||
def __del__(self):
|
||||
if isinstance(self.base_addr, int) and self.dev_t is not None: self.dev_t.signal_pool.append(self.base_addr)
|
||||
|
||||
@property
|
||||
def value(self) -> int: return self.value_mv[0]
|
||||
|
||||
@@ -332,23 +337,29 @@ class HCQCompiled(Compiled, Generic[SignalType]):
|
||||
"""
|
||||
A base class for devices compatible with the HCQ (Hardware Command Queue) API.
|
||||
"""
|
||||
devices: list[HCQCompiled] = []
|
||||
devices: ClassVar[list[HCQCompiled]] = []
|
||||
signal_pages: ClassVar[list[Any]] = []
|
||||
signal_pool: ClassVar[list[int]] = []
|
||||
|
||||
def __init__(self, device:str, allocator:HCQAllocatorBase, renderer:Renderer, compiler:Compiler, runtime, signal_t:Type[SignalType],
|
||||
comp_queue_t:Type[HWQueue], copy_queue_t:Type[HWQueue]|None):
|
||||
self.device_id:int = int(device.split(":")[1]) if ":" in device else 0
|
||||
|
||||
from tinygrad.runtime.graph.hcq import HCQGraph
|
||||
super().__init__(device, allocator, renderer, compiler, runtime, HCQGraph)
|
||||
|
||||
# Map signals if any
|
||||
for sig_page in self.signal_pages: cast(HCQAllocator, self.allocator).map(sig_page)
|
||||
self.devices.append(self)
|
||||
|
||||
self.signal_t, self.hw_compute_queue_t, self.hw_copy_queue_t = signal_t, comp_queue_t, copy_queue_t
|
||||
self.timeline_value:int = 1
|
||||
self.timeline_signal:SignalType = self.signal_t(value=0, timeline_for_device=self)
|
||||
self._shadow_timeline_signal:SignalType = self.signal_t(value=0, timeline_for_device=self)
|
||||
self.sig_prof_records:list[tuple[HCQSignal, HCQSignal, str, bool]] = []
|
||||
|
||||
from tinygrad.runtime.graph.hcq import HCQGraph
|
||||
super().__init__(device, allocator, renderer, compiler, runtime, HCQGraph)
|
||||
|
||||
self.kernargs_page:HCQBuffer = self.allocator.alloc(16 << 20, BufferSpec(cpu_access=True))
|
||||
self.kernargs_allocator:BumpAllocator = BumpAllocator(self.kernargs_page.size, base=cast(int, self.kernargs_page.va_addr), wrap=True)
|
||||
self.devices.append(self)
|
||||
|
||||
def synchronize(self):
|
||||
try: self.timeline_signal.wait(self.timeline_value - 1)
|
||||
@@ -361,6 +372,14 @@ class HCQCompiled(Compiled, Generic[SignalType]):
|
||||
Compiled.profile_events += [ProfileRangeEvent(self.device, name, st.timestamp, en.timestamp, cp) for st,en,name,cp in self.sig_prof_records]
|
||||
self.sig_prof_records = []
|
||||
|
||||
@classmethod
def _alloc_signal_addr(cls) -> int:
  """
  Pop one signal address from this device class's pool, growing the pool by a page when it is empty.

  A new page is allocated through the first registered device's allocator, split into 16-byte slots,
  and mapped through the allocator of every device currently in `cls.devices`.
  """
  # Fast path: a free slot is already available.
  if cls.signal_pool: return cls.signal_pool.pop()

  page = cls.devices[0].allocator.alloc(0x1000, BufferSpec(host=True, uncached=True, cpu_access=True))
  cls.signal_pages.append(page)  # recorded in the class-level list of signal pages
  cls.signal_pool += [page.va_addr + off for off in range(0, page.size, 16)]
  for dev in cls.devices: cast(HCQAllocator, dev.allocator).map(page)
  return cls.signal_pool.pop()
|
||||
|
||||
def _at_profile_finalize(self):
|
||||
def _sync(d:HCQCompiled, q_t:Type[HWQueue]):
|
||||
q_t().timestamp(d.timeline_signal).signal(d.timeline_signal, d.timeline_value).submit(d)
|
||||
|
||||
Reference in New Issue
Block a user