diff --git a/docs/runtime.md b/docs/runtime.md index b38824f336..045ca91ce6 100644 --- a/docs/runtime.md +++ b/docs/runtime.md @@ -5,7 +5,7 @@ tinygrad supports various runtimes, enabling your code to scale across a wide ra | Runtime | Description | Requirements | |---------|-------------|--------------| | [NV](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_nv.py) | Provides acceleration for NVIDIA GPUs | Ampere/Ada series GPUs | -| [AMD](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_amd.py) | Provides acceleration for AMD GPUs | RDNA2/RDNA3/RDNA4 series GPUs | +| [AMD](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_amd.py) | Provides acceleration for AMD GPUs | RDNA2/RDNA3/RDNA4 series GPUs. You can select the interface used to communicate with the device by setting `AMD_IFACE=KFD` or `AMD_IFACE=PCI`. See [AMD interfaces](#amd-interfaces) for more details. | | [QCOM](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_qcom.py) | Provides acceleration for QCOM GPUs | 6xx series GPUs | | [METAL](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_metal.py) | Utilizes Metal for acceleration on Apple devices | M1+ Macs; Metal 3.0+ for `bfloat` support | | [CUDA](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cuda.py) | Utilizes CUDA for acceleration on NVIDIA GPUs | NVIDIA GPU with CUDA support | @@ -64,3 +64,11 @@ rawbuf_ptr = to_mv(cl_buf_desc_ptr, 0x100).cast('Q')[20] # offset 0xA0 is a raw # create tiny tensor tiny = Tensor.from_blob(rawbuf_ptr, (h*w*4,), dtype=dtypes.imagef((h,w)), device='QCOM') ``` + +## AMD Interfaces +The AMD backend supports several interfaces for communicating with devices: + +* `KFD`: uses the amdgpu driver +* `PCI`: uses the [AM driver](developer/am.md) + +You can force an interface by setting `AMD_IFACE` to one of these values. Note that `AMD_IFACE=PCI` may unbind your GPU from the amdgpu driver.
diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index 3c9f5bedf6..cc162453b9 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -297,7 +297,7 @@ class AMDComputeQueue(HWQueue): self.release_mem(signal.value_addr, value, self.pm4.data_sel__mec_release_mem__send_32_bit_low, self.pm4.int_sel__mec_release_mem__send_interrupt_after_write_confirm, cache_flush=True) - if not AMDDevice.driverless and (dev:=signal.timeline_for_device) is not None: + if (dev:=signal.timeline_for_device) is not None and not dev.is_am(): self.release_mem(dev.queue_event_mailbox_ptr, dev.queue_event.event_id, self.pm4.data_sel__mec_release_mem__send_32_bit_low, self.pm4.int_sel__mec_release_mem__send_interrupt_after_write_confirm, ctxid=dev.queue_event.event_id) return self @@ -353,10 +353,10 @@ class AMDCopyQueue(HWQueue): fence_flags = self.sdma.SDMA_PKT_FENCE_HEADER_MTYPE(3) if self.dev.target >= (10,0,0) else 0 self.q(self.sdma.SDMA_OP_FENCE | fence_flags, *data64_le(signal.value_addr), value) - if not AMDDevice.driverless and (dev:=signal.timeline_for_device) is not None: + if (dev:=signal.timeline_for_device) is not None and not dev.is_am(): self.q(self.sdma.SDMA_OP_FENCE | fence_flags, *data64_le(dev.queue_event_mailbox_ptr), dev.queue_event.event_id) self.q(self.sdma.SDMA_OP_TRAP, self.sdma.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(dev.queue_event.event_id)) - elif AMDDevice.driverless: self.q(self.sdma.SDMA_OP_TRAP, self.sdma.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(0)) + elif dev is not None and dev.is_am(): self.q(self.sdma.SDMA_OP_TRAP, self.sdma.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(0)) return self @@ -372,7 +372,7 @@ class AMDCopyQueue(HWQueue): return self def bind(self, dev:AMDDevice): - if not getenv("AMD_SDMA_BIND", 0) or not dev.driverless: return + if not getenv("AMD_SDMA_BIND", 0) or not dev.is_am(): return self.binded_device = dev self.hw_page = dev.allocator.alloc((qsz:=round_up(len(self._q), 8)) * 4, BufferSpec(cpu_access=True, 
nolru=True, uncached=True)) @@ -496,7 +496,7 @@ class AMDQueueDesc: if CPUProgram.atomic_lib is not None: CPUProgram.atomic_lib.atomic_thread_fence(__ATOMIC_SEQ_CST:=5) # Flush hdp if queue is in dev mem. - if dev.driverless and getenv("AMD_ALLOC_QUEUE_DEV_MEM", 1): dev.dev_iface.adev.gmc.flush_hdp() + if dev.is_am() and getenv("AMD_ALLOC_QUEUE_DEV_MEM", 1): dev.dev_iface.adev.gmc.flush_hdp() for doorbell in self.doorbells: doorbell[0] = self.put_value @dataclass(frozen=True) @@ -798,11 +798,18 @@ class AMDDevice(HCQCompiled): signal_pages: ClassVar[list[HCQBuffer]] = [] signal_pool: ClassVar[list[HCQBuffer]] = [] - driverless:bool = not FileIOInterface.exists('/sys/module/amdgpu') or bool(getenv("AMD_DRIVERLESS", 0)) + def is_am(self) -> bool: return isinstance(self.dev_iface, PCIIface) + + def _select_iface(self): + errs:str = "" + for iface_t in (KFDIface, PCIIface) if len(nm:=getenv("AMD_IFACE", "")) == 0 else (getattr(sys.modules[__name__], f"{nm}Iface"),): + try: return iface_t(self, self.device_id) + except Exception as e: errs += f"\n{iface_t.__name__}: {type(e).__name__}: {e}" + raise RuntimeError(f"Cannot find a usable interface for AMD:{self.device_id}:{errs}") def __init__(self, device:str=""): self.device_id = int(device.split(":")[1]) if ":" in device else 0 - self.dev_iface = PCIIface(self, self.device_id) if AMDDevice.driverless else KFDIface(self, self.device_id) + self.dev_iface = self._select_iface() self.target:tuple[int, ...] 
= ((trgt:=self.dev_iface.props['gfx_target_version']) // 10000, (trgt // 100) % 100, trgt % 100) self.arch = "gfx%d%x%x" % self.target if self.target < (9,4,2) or self.target >= (13,0,0): raise RuntimeError(f"Unsupported arch: {self.arch}") @@ -859,8 +866,8 @@ class AMDDevice(HCQCompiled): self.sqtt_enabled = PROFILE and bool(getenv("SQTT", 0)) if self.sqtt_enabled: if self.arch != 'gfx1100': raise RuntimeError('SQ Thread Tracing is only supported on 7900XTX') - if not self.driverless and (ppfeaturemask:=int(FileIOInterface('/sys/module/amdgpu/parameters/ppfeaturemask', os.O_RDONLY).read(), 16))&0x8000: - raise RuntimeError("SQTT can't be enabled because of hardware bug, to workaround either use driverless or add " + if not self.is_am() and (ppfeaturemask:=int(FileIOInterface('/sys/module/amdgpu/parameters/ppfeaturemask', os.O_RDONLY).read(), 16))&0x8000: + raise RuntimeError("SQTT can't be enabled because of hardware bug, to workaround either use AMD_IFACE=PCI or add " f"ppfeaturemask={(ppfeaturemask&~0x8000):#x} (current {ppfeaturemask=:#x} & ~PP_GFXOFF_MASK) to amdgpu module parameters\n" "For more information read https://github.com/tinygrad/tinygrad/blob/master/extra/sqtt/README.md") SQTT_BUFFER_SIZE = getenv("SQTT_BUFFER_SIZE", 256) # in mb, per shader engine