class USB2Iface(PCIIface):
  """AMD interface that drives the GPU through a `USB2PCIDevice`.

  `__init__` sets up two fixed host-visible regions: one VRAM copy buffer exposed through a PCIe BAR view, and a
  small system buffer that `alloc` hands out piece by piece with a bump offset.
  """
  def __init__(self, dev, dev_id): # pylint: disable=super-init-not-called
    pci_dev = USB2PCIDevice(dev.__class__.__name__[:2], f"usb2:{dev_id}")
    self.dev, self.pci_dev, self.vram_bar = dev, pci_dev, 0
    self.dev_impl = AMDev(self.pci_dev)
    self._compute_props()
    self.pci_dev.usb2._pci_cacheable.append(self.pci_dev.bar_info(2)) # doorbell region is cacheable

    # Copy buffers in VRAM with PCIe bar views (no SRAM staging — we write directly via streaming bulk)
    self.copy_bufs = [self._vram_copy_buf(0x80000)]
    # sys_buf for small host-side allocations (ring pointers, signals, etc.) — use XDATA-accessible region
    self.sys_buf = self._dma_region(ctrl_addr=0xa000, sys_addr=0x820000, size=0x1000)
    self.sys_next_off = 0x800 # bump offset for `alloc`; the first 0x800 bytes of sys_buf are never handed out

  def _vram_copy_buf(self, size):
    """Allocate `size` bytes of uncached VRAM and return an HCQBuffer whose view is a window of the VRAM BAR."""
    vram = self.dev_impl.mm.valloc(size, uncached=True)
    # NOTE(review): the BAR window starts at the first physical range and spans `size` bytes, which presumes the
    # allocation is physically contiguous — confirm valloc guarantees that here.
    first_paddr = vram.paddrs[0][0]
    bar_view = self.pci_dev.map_bar(bar=self.vram_bar, off=first_paddr, size=size)
    meta = PCIAllocationMeta(vram, has_cpu_mapping=False)
    return HCQBuffer(vram.va_addr, size, meta=meta, view=bar_view, owner=self.dev)

  def _dma_region(self, ctrl_addr, sys_addr, size):
    """Map system address `sys_addr` into a fresh GPU virtual range and pair it with a controller-side view at `ctrl_addr`."""
    mm = self.dev_impl.mm
    vaddr = mm.alloc_vaddr(size=size)
    region = mm.map_range(vaddr, size, [(sys_addr, size)], aspace=AddrSpace.SYS, uncached=True)
    meta = PCIAllocationMeta(region, has_cpu_mapping=False)
    ctrl_view = self.pci_dev.dma_view(ctrl_addr, size)
    return HCQBuffer(vaddr, size, meta=meta, view=ctrl_view, owner=self.dev)

  def alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, force_devmem=False, **kwargs) -> HCQBuffer:
    """Serve host / uncached+cpu_access requests from `sys_buf` while they fit; everything else goes to the parent allocator.

    The parent call always passes `force_devmem=True`, regardless of the `force_devmem` argument received here.
    """
    wants_sys_buf = host or (uncached and cpu_access)
    # NOTE(review): the bound is strict, so a request that would end exactly at the end of sys_buf falls through — confirm intended
    if not wants_sys_buf or self.sys_next_off + size >= self.sys_buf.size:
      return super().alloc(size, host=host, uncached=uncached, cpu_access=cpu_access, contiguous=contiguous, force_devmem=True, **kwargs)
    start = self.sys_next_off
    self.sys_next_off = start + size
    return self.sys_buf.offset(start, size)

  def create_queue(self, queue_type, ring, gart, rptr, wptr, eop_buffer=None, cwsr_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0,
                   xcc_id=0, idx=0):
    """Create the queue via the parent; for compute queues the ring's CPU-view range is first added to the controller's cacheable list."""
    if queue_type == kfd.KFD_IOC_QUEUE_TYPE_COMPUTE:
      self.pci_dev.usb2._pci_cacheable.append((ring.cpu_view().addr, ring.size))
    return super().create_queue(queue_type, ring, gart, rptr, wptr, eop_buffer, cwsr_buffer, ctl_stack_size, ctx_save_restore_size, xcc_id, idx)

  def sleep(self, timeout):
    """No-op for this interface."""
    pass
def pci_setup_usb2_bars(self, usb:USB2Controller, gpu_bus:int, mem_base:int, pref_mem_base:int) -> dict[int, tuple[int, int]]:
  """Set up the BARs behind a USB2 controller by forwarding every argument, unchanged, to `pci_setup_usb_bars`.

  Returns whatever `pci_setup_usb_bars` returns (annotated as a mapping from BAR index to a pair of ints).
  """
  # USB2Controller has same pcie_cfg_req interface
  bars = self.pci_setup_usb_bars(usb, gpu_bus, mem_base, pref_mem_base) # type: ignore
  return bars
class USB2PCIDevice(PCIDevice):
  """PCI device reached through a `USB2Controller` instead of the host PCI bus.

  Config-space accesses and BAR setup all target bus 4, device 0, function 0 behind the controller.
  """
  def __init__(self, devpref:str, pcibus:str):
    lock_name = f"{devpref.lower()}_{pcibus.lower()}.lock"
    self.lock_fd = System.flock_acquire(lock_name)
    self.usb2 = USB2Controller()
    bars = System.pci_setup_usb2_bars(self.usb2, gpu_bus=4, mem_base=0x10000000, pref_mem_base=(32 << 30))
    self.pcibus, self._bar_info = pcibus, bars
    # bump allocator over the 0x80000-byte window that alloc_sysmem hands out
    self.sram = BumpAllocator(size=0x80000, wrap=False)

  def dma_view(self, ctrl_addr, size):
    """Byte-wise view of `size` bytes at controller address `ctrl_addr` (created with pcimem=False)."""
    return USB2MMIOInterface(self.usb2, ctrl_addr, size, fmt='B', pcimem=False)

  def alloc_sysmem(self, size:int, vaddr:int=0, contiguous:bool=False) -> tuple[MMIOInterface, list[int]]:
    """Bump-allocate `size` bytes and return (controller-side view, [address at 0x200000 + offset]).

    `vaddr` and `contiguous` are accepted but not used.
    """
    off = self.sram.alloc(size)
    return self.dma_view(0xf000 + off, size), [0x200000 + off]

  def read_config(self, offset:int, size:int):
    """Read `size` bytes of config space at `offset`."""
    return self.usb2.pcie_cfg_req(offset, bus=4, dev=0, fn=0, size=size)

  def write_config(self, offset:int, value:int, size:int):
    """Write `value` (`size` bytes) to config space at `offset`."""
    self.usb2.pcie_cfg_req(offset, bus=4, dev=0, fn=0, value=value, size=size)

  def bar_info(self, bar_idx:int) -> tuple[int, int]: # type: ignore[override]
    """Return the entry recorded for `bar_idx` at setup time; `map_bar` uses it as (base address, size)."""
    return self._bar_info[bar_idx]

  def map_bar(self, bar, off=0, addr=0, size=None, fmt='B'):
    """View into BAR `bar` starting `off` bytes in; a falsy `size` means the whole BAR size. `addr` is not used."""
    if size: return USB2MMIOInterface(self.usb2, self.bar_info(bar)[0] + off, size, fmt)
    bar_base, bar_size = self.bar_info(bar)
    return USB2MMIOInterface(self.usb2, bar_base + off, bar_size, fmt)

  def resize_bar(self, bar_idx:int):
    """No-op."""
    pass # already resized
libusb.libusb_clear_halt(self.handle, ep) - # Allocate streams + # Allocate streams (falls back to no-stream UAS on USB 2.0) stream_eps = (ctypes.c_uint8 * 3)(self.ep_data_out, self.ep_data_in, self.ep_stat_in) - if (rc:=libusb.libusb_alloc_streams(self.handle, self.max_streams * len(stream_eps), stream_eps, len(stream_eps))) < 0: - raise RuntimeError(f"alloc_streams failed: {rc}") + rc = libusb.libusb_alloc_streams(self.handle, self.max_streams * len(stream_eps), stream_eps, len(stream_eps)) + self.use_streams = rc >= 0 + if not self.use_streams: self.max_streams = 1 + self._uas_tag = 0 # Base cmd cmd_template = bytes([0x01, 0x00, 0x00, 0x01, *([0] * 12), 0xE4, 0x24, 0x00, 0xB2, 0x1A, 0x00, 0x00, 0x00, *([0] * 8)]) @@ -127,7 +129,38 @@ class USB3: sig, rtag, residue, status = struct.unpack(" data -> status, unique tag per command + slot = 0 + self.buf_cmd[slot][16:16+len(cdb)] = list(cdb) + self._uas_tag = (self._uas_tag % 255) + 1 # UAS tag must be unique and non-zero + self.buf_cmd[slot][3] = self._uas_tag + + # 1. Send command IU + self._bulk_out(self.ep_cmd_out, bytes(self.buf_cmd[slot])) + + # 2. 
Data phase + status + if rlen: + if rlen > len(self.buf_data_in[slot]): self.buf_data_in[slot] = (ctypes.c_uint8 * round_up(rlen, 0x1000))() + results.append(self._bulk_in(self.ep_data_in, rlen)) + _stat = self._bulk_in(self.ep_stat_in, 64) + elif send_data is not None: + for _retry in range(10): + _rtt = self._bulk_in(self.ep_stat_in, 64) # Ready-to-Transfer IU or early completion + if _rtt[0] == 0x07: break # RTT: device ready for data + # Device sent Sense/Response instead of RTT, re-send command + self._uas_tag = (self._uas_tag % 255) + 1 + self.buf_cmd[slot][3] = self._uas_tag + self._bulk_out(self.ep_cmd_out, bytes(self.buf_cmd[slot])) + else: raise RuntimeError("UAS: failed to get Ready-to-Transfer after 10 retries") + self._bulk_out(self.ep_data_out, send_data) + _stat = self._bulk_in(self.ep_stat_in, 64) + results.append(None) + else: + # No data phase - just read status + _stat = self._bulk_in(self.ep_stat_in, 64) + results.append(None) else: # allocate slot and stream. stream is 1-based slot, stream = idx % self.max_streams, (idx % self.max_streams) + 1 @@ -209,10 +242,13 @@ class ASM24Controller: def write(self, base_addr:int, data:bytes, ignore_cache:bool=True): return self.exec_ops([WriteOp(base_addr, data, ignore_cache)]) def scsi_write(self, buf:bytes, lba:int=0): - if len(buf) > 0x4000: buf += b'\x00' * (round_up(len(buf), 0x10000) - len(buf)) + #chunk = 0x2000 if not self.usb.use_streams else 0x10000 + chunk = 512 + if len(buf) > 0x4000: buf += b'\x00' * (round_up(len(buf), chunk) - len(buf)) - for i in range(0, len(buf), 0x10000): - self.exec_ops([ScsiWriteOp(buf[i:i+0x10000], lba), WriteOp(0x171, b'\xff\xff\xff', ignore_cache=True)]) + for i in range(0, len(buf), chunk): + self.exec_ops([WriteOp(0x7ef, b'\x00', ignore_cache=True)]) # re-arm SCSI write path + self.exec_ops([ScsiWriteOp(buf[i:i+chunk], lba), WriteOp(0x171, b'\xff\xff\xff', ignore_cache=True)]) self.exec_ops([WriteOp(0xce6e, b'\x00\x00', ignore_cache=True)]) if len(buf) > 0x4000: 
# =============================================================================
# USB2 controller — uses 0xF0 vendor command with streaming bulk for PCIe access
# =============================================================================

# Request type bytes placed in the low byte of wValue of the 0xF0 command.
# NOTE(review): presumably PCIe TLP fmt/type codes (mem write/read 64, config read/write type 0/1) — confirm against firmware.
MWR64, MRD64, CFGRD0, CFGRD1, CFGWR0, CFGWR1 = 0x60, 0x20, 0x04, 0x05, 0x44, 0x45
EP_OUT, EP_IN = 0x02, 0x81 # bulk endpoints used by stream_write / stream_read
READ_CHUNK = 16 # dwords per bulk IN chunk

class USB2Controller:
  """libusb-backed controller for the USB device with VID:PID ADD1:0001.

  Three access paths are offered:
    * byte access to the controller's xdata space through vendor control requests 0xE4 (read) / 0xE5 (write),
    * single PCIe requests through vendor control request 0xF0 (`pcie_request` and its wrappers),
    * streaming transfers over the bulk endpoints after an 0xF0 setup request (`stream_write` / `stream_read`).
  """
  def __init__(self):
    """Open the device, detach an active kernel driver, select configuration 1 and claim interface 0.

    Raises RuntimeError when libusb cannot be initialised, the device is missing, or configuration/claim fails.
    """
    self.ctx = ctypes.POINTER(libusb.struct_libusb_context)()
    if libusb.libusb_init(ctypes.byref(self.ctx)): raise RuntimeError("libusb_init failed")
    self.handle = libusb.libusb_open_device_with_vid_pid(self.ctx, 0xADD1, 0x0001)
    if not self.handle: raise RuntimeError("USB2 device ADD1:0001 not found")
    if libusb.libusb_kernel_driver_active(self.handle, 0): libusb.libusb_detach_kernel_driver(self.handle, 0)
    if libusb.libusb_set_configuration(self.handle, 1): raise RuntimeError("set_configuration failed")
    if libusb.libusb_claim_interface(self.handle, 0): raise RuntimeError("claim_interface failed")
    # (base, size) address ranges in which a repeated identical dword write may be skipped
    self._pci_cacheable: list[tuple[int, int]] = []
    # last dword written per address; None means "unknown"
    self._pci_cache: dict[int, int|None] = {}

  # -- low-level xdata access (0xE4/0xE5) --
  def xdata_read(self, addr, size=1):
    """Read up to `size` bytes at xdata address `addr`; returns the bytes actually transferred. Raises RuntimeError on failure."""
    buf = (ctypes.c_ubyte * size)()
    ret = libusb.libusb_control_transfer(self.handle, 0xC0, 0xE4, addr, 0, buf, size, 1000)
    # explicit raise instead of assert so the check survives `python -O`
    if ret < 0: raise RuntimeError(f"E4 read 0x{addr:04X} failed: {ret}")
    return bytes(buf[:ret])

  def xdata_write(self, addr, val):
    """Write the single byte `val` to xdata address `addr`. Raises RuntimeError on failure."""
    ret = libusb.libusb_control_transfer(self.handle, 0x40, 0xE5, addr, val, None, 0, 1000)
    if ret < 0: raise RuntimeError(f"E5 write 0x{addr:04X}=0x{val:02X} failed: {ret}")

  def read(self, base_addr, length):
    """Read `length` bytes of xdata starting at `base_addr`."""
    return self.xdata_read(base_addr, length)

  def write(self, base_addr, data, ignore_cache=True):
    """Write `data` to xdata one byte per control transfer. `ignore_cache` is accepted but not used."""
    for i, b in enumerate(data): self.xdata_write(base_addr + i, b)

  # -- 0xF0 single TLP --
  def _f0_out(self, fmt_type, be, mode, count, addr_lo, addr_hi, value_be):
    """Issue the 0xF0 OUT control request.

    wValue carries `fmt_type` (low byte) and the byte-enable mask `be` (shifted left by 8); wIndex carries `mode`
    (bits 0-1) and `count` (bits 2-7). The 12-byte data stage carries the two address words and the value.
    Raises RuntimeError when the transfer fails.
    """
    wval = fmt_type | (be << 8)
    widx = (mode & 0x03) | ((count & 0x3F) << 2)
    # The reviewed text packed only `value_be`, leaving addr_lo/addr_hi unused and 8 of the 12 payload bytes zero.
    # NOTE(review): address words are assumed little-endian ahead of the big-endian value — confirm against firmware.
    payload = struct.pack('<II', addr_lo, addr_hi) + struct.pack('>I', value_be)
    buf = (ctypes.c_ubyte * 12)(*payload)
    ret = libusb.libusb_control_transfer(self.handle, 0x40, 0xF0, wval, widx, buf, 12, 5000)
    if ret < 0: raise RuntimeError(f"F0 OUT failed: {ret}")

  def _f0_in(self):
    """Fetch the 8-byte 0xF0 IN response (bytes 0-3: big-endian data, byte 7: status). Raises RuntimeError on failure."""
    buf = (ctypes.c_ubyte * 8)()
    ret = libusb.libusb_control_transfer(self.handle, 0xC0, 0xF0, 0, 0, buf, 8, 5000)
    if ret < 0: raise RuntimeError(f"F0 IN failed: {ret}")
    return bytes(buf)

  def _is_pci_cacheable(self, addr):
    """True when `addr` lies inside one of the registered ranges; the end address `base + size` itself is outside."""
    return any(x <= addr < x + sz for x, sz in self._pci_cacheable)

  def pcie_request(self, fmt_type, address, value=None, size=4, retries=10):
    """Send one PCIe request of `size` bytes at `address` and return the completion data, if any.

    A 4-byte MWR64 to a cacheable address is skipped when the same value was the last one written there.
    Requests classified as writes by the `is_write` mask test return None without reading a response. For all
    other requests (the config write codes do not match that test either) the response is read and the
    addressed bytes are returned as an int.

    Raises RuntimeError when the status byte is 0x01 after `retries` retries, TimeoutError when it is 0xFF.
    """
    if fmt_type == MWR64 and size == 4 and self._is_pci_cacheable(address) and self._pci_cache.get(address) == value: return None
    if DEBUG >= 5: print("usb2 pcie_request", hex(fmt_type), hex(address), value, size)
    # any other access to this address invalidates the remembered value
    self._pci_cache[address] = value if size == 4 and fmt_type == MWR64 else None

    # the request is dword aligned; byte enables and a shifted value select the bytes inside that dword
    masked, offset = address & 0xFFFFFFFC, address & 0x3
    be = ((1 << size) - 1) << offset
    shifted = ((value << (8 * offset)) & 0xFFFFFFFF) if value is not None else 0
    self._f0_out(fmt_type, be, 0, 0, masked & 0xFFFFFFFF, address >> 32, shifted)
    is_write = ((fmt_type & 0xDF) == 0x40) or ((fmt_type & 0xB8) == 0x30)
    if is_write: return None
    result = self._f0_in()
    fw_status = result[7]
    if fw_status == 0x01:
      if retries > 0: return self.pcie_request(fmt_type, address, value, size, retries - 1)
      raise RuntimeError(f"Unsupported Request at 0x{address:08X}")
    if fw_status == 0xFF: raise TimeoutError(f"PCIe completion timeout at 0x{address:08X}")
    raw = struct.unpack('>I', result[0:4])[0]
    return (raw >> (8 * offset)) & ((1 << (8 * size)) - 1)

  def pcie_cfg_req(self, byte_addr, bus=1, dev=0, fn=0, value=None, size=4):
    """Config-space read (value is None) or write; bus 0 uses the CFG*0 codes, any bus > 0 the CFG*1 codes."""
    fmt = (CFGWR1 if value is not None else CFGRD1) if bus > 0 else (CFGWR0 if value is not None else CFGRD0)
    address = (bus << 24) | (dev << 19) | (fn << 16) | (byte_addr & 0xFFF)
    return self.pcie_request(fmt, address, value, size)

  def pcie_mem_req(self, address, value=None, size=4):
    """Memory read (value is None, MRD64) or write (MWR64) of `size` bytes at `address`."""
    return self.pcie_request(MWR64 if value is not None else MRD64, address, value, size)

  # -- streaming bulk (mode 1=write, mode 2=read) --
  def _dma_setup(self, addr, mode, count=0):
    """Send the 0xF0 setup request that arms a streaming transfer at `addr` (all four byte enables set)."""
    fmt = {0: 0, 1: MWR64, 2: MRD64}[mode]
    self._f0_out(fmt, 0x0F, mode, count, addr & 0xFFFFFFFF, addr >> 32, 0)

  def stream_write(self, addr, data):
    """Streaming write to PCIe address via bulk OUT. Raises RuntimeError when the bulk transfer fails."""
    import time
    self._dma_setup(addr, 1)
    buf = (ctypes.c_ubyte * len(data)).from_buffer_copy(data)
    transferred = ctypes.c_int()
    ret = libusb.libusb_bulk_transfer(self.handle, EP_OUT, buf, len(data), ctypes.byref(transferred), 30000)
    if ret != 0: raise RuntimeError(f"bulk write failed: {ret}")
    time.sleep(0.001)

  def stream_read(self, addr, nbytes, chunk=READ_CHUNK):
    """Streaming read from PCIe address via bulk IN; returns exactly `nbytes` bytes. Raises RuntimeError on transfer failure."""
    self._dma_setup(addr, 2, chunk)
    chunk_bytes = chunk * 4
    resp = (ctypes.c_ubyte * chunk_bytes)()
    result = bytearray()
    transferred = ctypes.c_int()
    # NOTE(review): a device that keeps completing transfers with zero bytes would keep this loop spinning — confirm it cannot
    while len(result) < nbytes:
      ret = libusb.libusb_bulk_transfer(self.handle, EP_IN, resp, chunk_bytes, ctypes.byref(transferred), 5000)
      if ret != 0: raise RuntimeError(f"bulk read failed: {ret}")
      result.extend(bytes(resp[:transferred.value]))
    return bytes(result[:nbytes])

class USB2MMIOInterface(MMIOInterface):
  """MMIO-style window served by a `USB2Controller`.

  With `pcimem=True` the window addresses PCIe memory (single requests or streaming bulk); with `pcimem=False`
  it addresses the controller's xdata space byte by byte.
  """
  def __init__(self, usb:USB2Controller, addr:int, size:int, fmt='B', pcimem=True):
    self.usb, self.addr, self.nbytes, self.fmt, self.pcimem = usb, addr, size, fmt, pcimem
    self.el_sz = struct.calcsize(fmt)

  def __getitem__(self, index): return self._access(index)
  def __setitem__(self, index, val): self._access(index, val)

  def view(self, offset=0, size=None, fmt=None):
    """Sub-window starting `offset` bytes in; a falsy `size` extends to the end, a falsy `fmt` keeps the current one."""
    return USB2MMIOInterface(self.usb, self.addr + offset, size or (self.nbytes - offset), fmt=fmt or self.fmt, pcimem=self.pcimem)

  def _access(self, index, val=None):
    """Dispatch an element index or slice to the single-element or range path (read when `val` is None, else write)."""
    if isinstance(index, slice):
      start, stop = (index.start or 0) * self.el_sz, (index.stop or len(self)) * self.el_sz
      return self._acc_range(start, stop - start, val)
    return self._acc_one(index * self.el_sz, self.el_sz, val) if self.pcimem else self._acc_range(index * self.el_sz, self.el_sz, val)

  def _acc_one(self, off, sz, val=None):
    """Single element access via single TLPs; an 8-byte element is split into an upper and a lower dword request."""
    upper = 0 if sz < 8 else self.usb.pcie_mem_req(self.addr + off + 4, val if val is None else (val >> 32), 4)
    lower = self.usb.pcie_mem_req(self.addr + off, val if val is None else val & 0xffffffff, min(sz, 4))
    if val is None: return lower | (upper << 32)

  def _acc_range(self, off, sz, data=None):
    """Range access — uses streaming bulk for large PCIe transfers, single TLP for small/non-PCIe.

    Reads (`data` is None) return bytes, except a non-PCIe read of exactly one element, which returns an int.
    Writes accept an int (packed with `self.fmt`) or anything `bytes()` accepts, and return None.
    """
    if data is None: # read
      if not self.pcimem:
        raw = self.usb.xdata_read(self.addr + off, sz)
        return int.from_bytes(raw, "little") if sz == self.el_sz else raw
      if sz >= 64: # streaming read for large transfers
        # Only whole dwords can be byte-swapped; the previous code dropped the last sz % 4 bytes, returning a short result.
        aligned = sz - sz % 4
        raw = self.usb.stream_read(self.addr + off, aligned)
        # Convert from big-endian dwords to host byte order
        arr = array.array('I')
        arr.frombytes(raw)
        arr.byteswap()
        # the remaining bytes (at most 3) are fetched with single-byte TLPs
        return arr.tobytes() + bytes(self._acc_one(off + i, 1) for i in range(aligned, sz))
      # small read: use single TLPs
      acc_sz = 4 if sz % 4 == 0 else (2 if sz % 2 == 0 else 1)
      return bytes(array.array('I' if acc_sz == 4 else ('H' if acc_sz == 2 else 'B'),
                               [self._acc_one(off + i * acc_sz, acc_sz) for i in range(sz // acc_sz)]))
    # write
    data = struct.pack(self.fmt, data) if isinstance(data, int) else bytes(data)
    if not self.pcimem:
      for i, b in enumerate(data): self.usb.xdata_write(self.addr + off + i, b)
      return
    if len(data) >= 64: # streaming write for large transfers
      # Swapping a zero-padded tail dword and truncating afterwards sent zeros in place of the trailing bytes,
      # so only whole dwords are streamed and the remaining bytes (at most 3) go out as single-byte TLPs.
      aligned = len(data) - len(data) % 4
      # Convert from host byte order to big-endian dwords
      arr = array.array('I')
      arr.frombytes(data[:aligned])
      arr.byteswap()
      self.usb.stream_write(self.addr + off, arr.tobytes())
      for i in range(aligned, len(data)): self.usb.pcie_mem_req(self.addr + off + i, data[i], 1)
      return
    # small write: use single TLPs
    acc_sz = 4 if len(data) % 4 == 0 else (2 if len(data) % 2 == 0 else 1)
    for i in range(0, len(data), acc_sz):
      self.usb.pcie_mem_req(self.addr + off + i, int.from_bytes(data[i:i+acc_sz], "little"), acc_sz)