diff --git a/.pylintrc b/.pylintrc index 2f1de51927..dc51be94d7 100644 --- a/.pylintrc +++ b/.pylintrc @@ -30,10 +30,6 @@ persistent=yes # Specify a configuration file. #rcfile= -# When enabled, pylint would attempt to guess common misconfiguration and emit -# user-friendly hints instead of false-positive error messages -suggestion-mode=yes - # Allow loading of arbitrary C extensions. Extensions are imported into the # active Python interpreter and may run arbitrary code. unsafe-load-any-extension=no diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py index 45f1c9cf58..00e2de83d2 100644 --- a/tinygrad/helpers.py +++ b/tinygrad/helpers.py @@ -153,7 +153,7 @@ CORRECT_DIVMOD_FOLDING, FUSE_OPTIM = ContextVar("CORRECT_DIVMOD_FOLDING", 0), Co ALLOW_DEVICE_USAGE, MAX_BUFFER_SIZE = ContextVar("ALLOW_DEVICE_USAGE", 1), ContextVar("MAX_BUFFER_SIZE", 0) FUSE_ATTENTION = ContextVar("FUSE_ATTENTION", 0) EMULATE = ContextVar("EMULATE", "") -CPU_COUNT = ContextVar("CPU_COUNT", max(1, len(aff(0)) if (aff:=getattr(os, "sched_getaffinity", None)) else (os.cpu_count() or 1))) +CPU_COUNT = ContextVar("CPU_COUNT", max(1, len(os.sched_getaffinity(0)) if (aff:=getattr(os, "sched_getaffinity", None)) else (os.cpu_count() or 1))) CPU_LLVM, AMD_LLVM = ContextVar("CPU_LLVM", 0), ContextVar("AMD_LLVM", 1) VIZ = PROFILE = ContextVar("VIZ", 0) SPEC = ContextVar("SPEC", 0) @@ -352,10 +352,10 @@ def capstone_flatdump(lib: bytes): print(f"{instr.address:#08x}: {instr.mnemonic}\t{instr.op_str}") sys.stdout.flush() -def wait_cond(cb, value=True, timeout_ms=10000, msg="") -> bool: +def wait_cond(cb, *args, value=True, timeout_ms=10000, msg="") -> bool: start_time = int(time.perf_counter() * 1000) while int(time.perf_counter() * 1000) - start_time < timeout_ms: - if (val:=cb()) == value: return val + if (val:=cb(*args)) == value: return val raise TimeoutError(f"{msg}. Timed out after {timeout_ms} ms, condition not met: {val} != {value}") # *** ctypes helpers diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index e901974a10..af239b8948 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -713,7 +713,7 @@ class PCIIface(PCIIfaceBase): def device_fini(self): self.dev_impl.fini() class USBIface(PCIIface): - def __init__(self, dev, dev_id): + def __init__(self, dev, dev_id): # pylint: disable=super-init-not-called self.dev = dev self.usb = ASM24Controller() self.bars = setup_pci_bars(self.usb, gpu_bus=4, mem_base=0x10000000, pref_mem_base=(32 << 30)) diff --git a/tinygrad/runtime/ops_remote.py b/tinygrad/runtime/ops_remote.py index 5c0c056a72..12c80cf255 100644 --- a/tinygrad/runtime/ops_remote.py +++ b/tinygrad/runtime/ops_remote.py @@ -424,7 +424,7 @@ class RemoteConnection: conns = RemoteConnection.all.keys() datas = {conn: conn.req.serialize() for conn in conns} reqs, hashes, hash_datas = sum(len(c.req._q) for c in conns), sum(len(c.req._h) for c in conns), sum(len(data) for data in datas.values()) - resps = [] + ret, resps = None, [] with Timing(f"*** send {reqs:-3d} requests {hashes:-3d} hashes with len {hash_datas/1024:.2f} kB in ", enabled=DEBUG>=3): for conn,data in datas.items(): conn.conn.request("POST", "/batch", data) for conn in datas.keys(): diff --git a/tinygrad/runtime/support/am/ip.py b/tinygrad/runtime/support/am/ip.py index e6ff7a24e2..7dc47643d8 100644 --- a/tinygrad/runtime/support/am/ip.py +++ b/tinygrad/runtime/support/am/ip.py @@ -113,7 +113,7 @@ class AM_GMC(AM_IP): for eng_i in range(18): self.adev.wreg_pair(f"reg{ip}VM_INVALIDATE_ENG{eng_i}_ADDR_RANGE", "_LO32", "_HI32", 0x1fffffffff) self.hub_initted[ip] = True - @functools.cache + @functools.cache # pylint: disable=method-cache-max-size-none def get_pte_flags(self, pte_lv, is_table, frag, uncached, system, snooped, valid, extra=0): extra |= (am.AMDGPU_PTE_SYSTEM * system) | (am.AMDGPU_PTE_SNOOPED * snooped) | (am.AMDGPU_PTE_VALID * valid) | am.AMDGPU_PTE_FRAG(frag) if not is_table: extra |= (am.AMDGPU_PTE_WRITEABLE | am.AMDGPU_PTE_READABLE | am.AMDGPU_PTE_EXECUTABLE) @@ -175,7 +175,7 @@ class AM_SMU(AM_IP): def _send_msg(self, msg:int, param:int, read_back_arg=False, timeout=10000, debug=False): # default timeout is 10 seconds self._smu_cmn_send_msg(msg, param, debug=debug) - wait_cond(lambda: (self.adev.mmMP1_SMN_C2PMSG_90 if not debug else self.adev.mmMP1_SMN_C2PMSG_54).read(), value=1, timeout_ms=timeout, + wait_cond((self.adev.mmMP1_SMN_C2PMSG_90 if not debug else self.adev.mmMP1_SMN_C2PMSG_54).read, value=1, timeout_ms=timeout, msg=f"SMU msg {msg:#x} timeout") return (self.adev.mmMP1_SMN_C2PMSG_82 if not debug else self.adev.mmMP1_SMN_C2PMSG_53).read() if read_back_arg else None diff --git a/tinygrad/runtime/support/elf.py b/tinygrad/runtime/support/elf.py index 3276e6adb8..3e5f61bafd 100644 --- a/tinygrad/runtime/support/elf.py +++ b/tinygrad/runtime/support/elf.py @@ -33,7 +33,7 @@ def elf_loader(blob:bytes, force_section_align:int=1) -> tuple[memoryview, list[ for sh, trgt_sh_name, c_rels in rel + rela: target_image_off = next(tsh for tsh in sections if tsh.name == trgt_sh_name).header.sh_addr rels = [(r.r_offset, symtab[libc.ELF64_R_SYM(r.r_info)], libc.ELF64_R_TYPE(r.r_info), getattr(r, "r_addend", 0)) for r in c_rels] - for roff, sym, r_type_, r_addend in rels: + for _, sym, _, _ in rels: if sym.st_shndx == 0: raise RuntimeError(f'Attempting to relocate against an undefined symbol {repr(_strtab(sh_strtab, sym.st_name))}') relocs += [(target_image_off + roff, sections[sym.st_shndx].header.sh_addr + sym.st_value, rtype, raddend) for roff, sym, rtype, raddend in rels] diff --git a/tinygrad/runtime/support/memory.py b/tinygrad/runtime/support/memory.py index e5624515e5..1c22c1ecd9 100644 --- a/tinygrad/runtime/support/memory.py +++ b/tinygrad/runtime/support/memory.py @@ -30,10 +30,10 @@ class TLSFAllocator: self.blocks:dict[int, tuple[int, int|None, int|None, bool]] = {0: (size, None, None, True)} # size, next, prev, is_free self._insert_block(0, size) - @functools.cache + @functools.cache # pylint: disable=method-cache-max-size-none def lv1(self, size): return size.bit_length() - @functools.cache + @functools.cache # pylint: disable=method-cache-max-size-none def lv2(self, size): return (size - (1 << (size.bit_length() - 1))) // (1 << max(0, size.bit_length() - self.l2_cnt)) def _insert_block(self, start:int, size:int, prev:int|None=None): @@ -209,7 +209,7 @@ class MemoryManager: if getenv("MM_DEBUG", 0): print(f"mm {self.dev.devfmt}: unmapping {vaddr=:#x} ({size=:#x})") ctx = PageTableTraverseContext(self.dev, self.root_page_table, vaddr, free_pts=True) - for off, pt, pte_idx, pte_cnt, pte_covers in ctx.next(size): + for _, pt, pte_idx, pte_cnt, _ in ctx.next(size): for pte_id in range(pte_idx, pte_idx + pte_cnt): assert pt.valid(pte_id), f"PTE not mapped: {pt.entry(pte_id):#x}" pt.set_entry(pte_id, paddr=0x0, valid=False) diff --git a/tinygrad/runtime/support/nv/ip.py b/tinygrad/runtime/support/nv/ip.py index 2037960215..eda20117e6 100644 --- a/tinygrad/runtime/support/nv/ip.py +++ b/tinygrad/runtime/support/nv/ip.py @@ -124,6 +124,7 @@ class NV_FLCN(NV_IP): def __patch(cmd_id, cmd): patched_image = bytearray(image) + dmem_offset = 0 hdr = nv.FALCON_APPLICATION_INTERFACE_HEADER_V1.from_buffer_copy(image[(app_hdr_off:=self.desc_v3.IMEMLoadSize+self.desc_v3.InterfaceOffset):]) ents = (nv.FALCON_APPLICATION_INTERFACE_ENTRY_V1 * hdr.entryCount).from_buffer_copy(image[app_hdr_off + ctypes.sizeof(hdr):]) for i in range(hdr.entryCount): @@ -334,7 +335,7 @@ class NV_GSP(NV_IP): # Fill up arguments queue_args = nv.MESSAGE_QUEUE_INIT_ARGUMENTS(sharedMemPhysAddr=queues_sysmem[0], pageTableEntryCount=pte_cnt, cmdQueueOffset=pt_size, statQueueOffset=pt_size + queue_size) - rm_args, self.rm_args_sysmem = self.nvdev._alloc_boot_struct(nv.GSP_ARGUMENTS_CACHED(bDmemStack=True, messageQueueInitArguments=queue_args)) + _, self.rm_args_sysmem = self.nvdev._alloc_boot_struct(nv.GSP_ARGUMENTS_CACHED(bDmemStack=True, messageQueueInitArguments=queue_args)) # Build command queue header self.cmd_q_va, self.stat_q_va = queues_va + pt_size, queues_va + pt_size + queue_size @@ -481,7 +482,7 @@ class NV_GSP(NV_IP): params.ramfcMem = nv_gpu.NV_MEMORY_DESC_PARAMS(base=ramfc_alloc.paddrs[0][0], size=0x200, addressSpace=2, cacheAttrib=0) params.instanceMem = nv_gpu.NV_MEMORY_DESC_PARAMS(base=ramfc_alloc.paddrs[0][0], size=0x1000, addressSpace=2, cacheAttrib=0) - method_va, method_sysmem = System.alloc_sysmem(0x5000, contiguous=True) + _, method_sysmem = System.alloc_sysmem(0x5000, contiguous=True) params.mthdbufMem = nv_gpu.NV_MEMORY_DESC_PARAMS(base=method_sysmem[0], size=0x5000, addressSpace=1, cacheAttrib=0) if client is not None and client != self.priv_root and params.hObjectError != 0: @@ -557,7 +558,7 @@ class NV_GSP(NV_IP): self.nvdev.wreg(addr, (self.nvdev.rreg(addr) & ~mask) | (val & mask)) elif op == 0x2: # reg poll addr, mask, val, _, _ = next(cmd_iter), next(cmd_iter), next(cmd_iter), next(cmd_iter), next(cmd_iter) - wait_cond(lambda: (self.nvdev.rreg(addr) & mask), value=val, msg=f"Register {addr:#x} not equal to {val:#x} after polling") + wait_cond(lambda a, m: (self.nvdev.rreg(a) & m), addr, mask, value=val, msg=f"Register {addr:#x} not equal to {val:#x} after polling") elif op == 0x3: time.sleep(next(cmd_iter) / 1e6) # delay us elif op == 0x4: # save reg addr, index = next(cmd_iter), next(cmd_iter) diff --git a/tinygrad/runtime/support/nv/nvdev.py b/tinygrad/runtime/support/nv/nvdev.py index 6831b5e8b1..496d8ec5c8 100644 --- a/tinygrad/runtime/support/nv/nvdev.py +++ b/tinygrad/runtime/support/nv/nvdev.py @@ -152,6 +152,8 @@ class NVDev(PCIDevImplBase): return gzip.decompress(struct.pack("<4BL2B", 0x1f, 0x8b, 8, 0, 0, 0, 3) + image) if "COMPRESSION: YES" in info else image def include(self, file:str): + def _do_eval(s:str): return eval(s) # pylint: disable=eval-used + regs_off = {'NV_PFALCON_FALCON': 0x0, 'NV_PGSP_FALCON': 0x0, 'NV_PSEC_FALCON': 0x0, 'NV_PRISCV_RISCV': 0x1000, 'NV_PGC6_AON': 0x0, 'NV_PFSP': 0x0, 'NV_PGC6_BSI': 0x0, 'NV_PFALCON_FBIF': 0x600, 'NV_PFALCON2_FALCON': 0x1000, 'NV_PBUS': 0x0, 'NV_PFB': 0x0, 'NV_PMC': 0x0, 'NV_PGSP_QUEUE': 0x0, 'NV_VIRTUAL_FUNCTION':0xb80000} @@ -163,13 +165,13 @@ class NVDev(PCIDevImplBase): name, hi, lo = m.groups() reg = next((r for r in self.reg_names if name.startswith(r+"_")), None) - if reg is not None: self.__dict__[reg].add_field(name[len(reg)+1:].lower(), eval(lo), eval(hi)) - else: self.reg_offsets[name] = (eval(lo), eval(hi)) + if reg is not None: self.__dict__[reg].add_field(name[len(reg)+1:].lower(), _do_eval(lo), _do_eval(hi)) + else: self.reg_offsets[name] = (_do_eval(lo), _do_eval(hi)) continue if m:=re.match(r'#define\s+(\w+)\s*\(\s*(\w+)\s*\)\s*(.+)', raw): # reg set fn = m.groups()[2].strip().rstrip('\\').split('/*')[0].rstrip() - name, value = m.groups()[0], eval(f"lambda {m.groups()[1]}: {fn}") + name, value = m.groups()[0], _do_eval(f"lambda {m.groups()[1]}: {fn}") elif m:=re.match(r'#define\s+(\w+)\s+([0-9A-Fa-fx]+)(?![^\n]*:)', raw): name, value = m.groups()[0], int(m.groups()[1], 0) # reg value else: continue diff --git a/tinygrad/runtime/support/system.py b/tinygrad/runtime/support/system.py index 66b2f78615..df575b89fe 100644 --- a/tinygrad/runtime/support/system.py +++ b/tinygrad/runtime/support/system.py @@ -10,14 +10,14 @@ MAP_FIXED, MAP_LOCKED, MAP_POPULATE, MAP_NORESERVE = 0x10, 0 if OSX else 0x2000, class _System: def reserve_hugepages(self, cnt): os.system(f"sudo sh -c 'echo {cnt} > /proc/sys/vm/nr_hugepages'") - def memory_barrier(self): lib.atomic_thread_fence(__ATOMIC_SEQ_CST:=5) if (lib:=self.atomic_lib()) is not None else None + def memory_barrier(self): lib.atomic_thread_fence(__ATOMIC_SEQ_CST:=5) if (lib:=self.atomic_lib) is not None else None def lock_memory(self, addr:int, size:int): if libc.mlock(ctypes.c_void_p(addr), size): raise RuntimeError(f"Failed to lock memory at {addr:#x} with size {size:#x}") def system_paddrs(self, vaddr:int, size:int) -> list[int]: - self.pagemap().seek(vaddr // mmap.PAGESIZE * 8) - return [(x & ((1<<55) - 1)) * mmap.PAGESIZE for x in array.array('Q', self.pagemap().read(size//mmap.PAGESIZE*8, binary=True))] + self.pagemap.seek(vaddr // mmap.PAGESIZE * 8) + return [(x & ((1<<55) - 1)) * mmap.PAGESIZE for x in array.array('Q', self.pagemap.read(size//mmap.PAGESIZE*8, binary=True))] def alloc_sysmem(self, size:int, vaddr:int=0, contiguous:bool=False, data:bytes|None=None) -> tuple[int, list[int]]: assert not contiguous or size <= (2 << 20), "Contiguous allocation is only supported for sizes up to 2MB" @@ -36,17 +36,17 @@ class _System: if vendor == target_vendor and device in target_devices: result.append(pcibus) return sorted(result) - @functools.cache + @functools.cached_property def atomic_lib(self): return ctypes.CDLL(ctypes.util.find_library('atomic')) if sys.platform == "linux" else None - @functools.cache + @functools.cached_property def pagemap(self) -> FileIOInterface: if FileIOInterface(reloc_sysfs:="/proc/sys/vm/compact_unevictable_allowed", os.O_RDONLY).read()[0] != "0": os.system(cmd:=f"sudo sh -c 'echo 0 > {reloc_sysfs}'") assert FileIOInterface(reloc_sysfs, os.O_RDONLY).read()[0] == "0", f"Failed to disable migration of locked pages. Please run {cmd} manually." return FileIOInterface("/proc/self/pagemap", os.O_RDONLY) - @functools.cache + @functools.cached_property def vfio(self) -> FileIOInterface|None: try: if not FileIOInterface.exists("/sys/module/vfio"): os.system("sudo modprobe vfio-pci disable_idle_d3=1") @@ -90,7 +90,7 @@ class PCIDevice: " to allow python accessing device or run with sudo") from e raise RuntimeError(f"Cannot resize BAR {i}: {e}. Ensure the resizable BAR option is enabled on your system.") from e - if getenv("VFIO", 0) and (vfio_fd:=System.vfio()) is not None: + if getenv("VFIO", 0) and (vfio_fd:=System.vfio) is not None: FileIOInterface(f"/sys/bus/pci/devices/{self.pcibus}/driver_override", os.O_WRONLY).write("vfio-pci") FileIOInterface("/sys/bus/pci/drivers_probe", os.O_WRONLY).write(self.pcibus) iommu_group = FileIOInterface.readlink(f"/sys/bus/pci/devices/{self.pcibus}/iommu_group").split('/')[-1] diff --git a/tinygrad/runtime/support/usb.py b/tinygrad/runtime/support/usb.py index 285e3cf287..2340c944cb 100644 --- a/tinygrad/runtime/support/usb.py +++ b/tinygrad/runtime/support/usb.py @@ -229,7 +229,7 @@ class ASM24Controller: for i in range(0, len(ops), bs:=(4 if OSX else 16)): self.exec_ops(list(itertools.chain.from_iterable(ops[i:i+bs]))) class USBMMIOInterface(MMIOInterface): - def __init__(self, usb, addr, size, fmt, pcimem=True): + def __init__(self, usb, addr, size, fmt, pcimem=True): # pylint: disable=super-init-not-called self.usb, self.addr, self.nbytes, self.fmt, self.pcimem, self.el_sz = usb, addr, size, fmt, pcimem, struct.calcsize(fmt) def __getitem__(self, index): return self._access_items(index) @@ -256,13 +256,14 @@ class USBMMIOInterface(MMIOInterface): acc, acc_size = self._acc_size(sz) return bytes(array.array(acc, [self._acc_one(off + i * acc_size, acc_size) for i in range(sz // acc_size)])) - else: # write op - data = struct.pack(self.fmt, data) if isinstance(data, int) else bytes(data) - if not self.pcimem: - # Fast path for writing into buffer 0xf000 - use_cache = 0xa800 <= self.addr <= 0xb000 - return self.usb.scsi_write(bytes(data)) if self.addr == 0xf000 else self.usb.write(self.addr + off, bytes(data), ignore_cache=not use_cache) + # write op + data = struct.pack(self.fmt, data) if isinstance(data, int) else bytes(data) - _, acc_sz = self._acc_size(len(data) * struct.calcsize(self.fmt)) - self.usb.pcie_mem_write(self.addr+off, [int.from_bytes(data[i:i+acc_sz], "little") for i in range(0, len(data), acc_sz)], acc_sz) + if not self.pcimem: + # Fast path for writing into buffer 0xf000 + use_cache = 0xa800 <= self.addr <= 0xb000 + return self.usb.scsi_write(bytes(data)) if self.addr == 0xf000 else self.usb.write(self.addr + off, bytes(data), ignore_cache=not use_cache) + + _, acc_sz = self._acc_size(len(data) * struct.calcsize(self.fmt)) + self.usb.pcie_mem_write(self.addr+off, [int.from_bytes(data[i:i+acc_sz], "little") for i in range(0, len(data), acc_sz)], acc_sz)