From a825608dc239f63f321e0ca2cd7d251f28fa4f69 Mon Sep 17 00:00:00 2001 From: nimlgen <138685161+nimlgen@users.noreply.github.com> Date: Thu, 15 May 2025 23:26:48 +0300 Subject: [PATCH] hcq: fix progs' __del__ when shutdown (#10341) * debug ci * better? * and mute this? * revert that --- tinygrad/runtime/ops_amd.py | 8 +++----- tinygrad/runtime/ops_nv.py | 8 +++----- tinygrad/runtime/ops_qcom.py | 8 +++----- tinygrad/runtime/support/hcq.py | 3 +++ 4 files changed, 12 insertions(+), 15 deletions(-) diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index 65ea6e4912..2461ef04ab 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -1,6 +1,6 @@ from __future__ import annotations from typing import Any, cast, ClassVar -import os, ctypes, ctypes.util, struct, hashlib, functools, importlib, mmap, errno, array, contextlib, sys, select +import os, ctypes, ctypes.util, struct, hashlib, functools, importlib, mmap, errno, array, contextlib, sys, select, weakref assert sys.platform != 'win32' from dataclasses import dataclass from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, FileIOInterface @@ -425,7 +425,7 @@ class AMDProgram(HCQProgram): self.dev, self.name, self.lib = dev, name, lib image, sections, _ = elf_loader(self.lib) - self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000), BufferSpec(cpu_access=True, nolru=True)) + self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000), buf_spec:=BufferSpec(cpu_access=True, nolru=True)) self.dev.allocator._copyin(self.lib_gpu, image) self.dev.synchronize() @@ -461,9 +461,7 @@ class AMDProgram(HCQProgram): super().__init__(CLikeArgsState, self.dev, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz, lib=self.lib, base=self.lib_gpu.va_addr) - - def __del__(self): - if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, 
BufferSpec(cpu_access=True, nolru=True)) + weakref.finalize(self, self._fini, self.dev, self.lib_gpu, buf_spec) class AMDAllocator(HCQAllocator['AMDDevice']): def __init__(self, dev:AMDDevice): diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py index 6a9987ec49..c45341408b 100644 --- a/tinygrad/runtime/ops_nv.py +++ b/tinygrad/runtime/ops_nv.py @@ -1,5 +1,5 @@ from __future__ import annotations -import os, ctypes, contextlib, re, functools, mmap, struct, array, sys +import os, ctypes, contextlib, re, functools, mmap, struct, array, sys, weakref assert sys.platform != 'win32' from typing import Any, cast, Union, Type, ClassVar from dataclasses import dataclass @@ -205,7 +205,7 @@ class NVProgram(HCQProgram): else: image, sections, relocs = elf_loader(self.lib, force_section_align=128) # NOTE: Ensure at least 4KB of space after the program to mitigate prefetch memory faults. - self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000) + 0x1000, BufferSpec(cpu_access=True)) + self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000) + 0x1000, buf_spec:=BufferSpec(cpu_access=True)) self.prog_addr, self.prog_sz, self.regs_usage, self.shmem_usage, self.lcmem_usage = self.lib_gpu.va_addr, image.nbytes, 0, 0x400, 0 self.constbufs: dict[int, tuple[int, int]] = {0: (0, 0x160)} # dict[constbuf index, tuple[va_addr, size]] @@ -256,6 +256,7 @@ class NVProgram(HCQProgram): # NV's kernargs is constbuffer, then arguments to the kernel follows. Kernargs also appends QMD at the end of the kernel. 
super().__init__(NVArgsState, self.dev, self.name, kernargs_alloc_size=round_up(self.constbufs[0][1], 1 << 8) + (8 << 8)) + weakref.finalize(self, self._fini, self.dev, self.lib_gpu, buf_spec) def _parse_elf_info(self, sh, start_off=0): while start_off < sh.header.sh_size: @@ -263,9 +264,6 @@ class NVProgram(HCQProgram): yield typ, param, sh.content[start_off+4:start_off+sz+4] if typ == 0x4 else sz start_off += (sz if typ == 0x4 else 0) + 4 - def __del__(self): - if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True)) - def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False): if prod(local_size) > 1024 or self.max_threads < prod(local_size) or self.lcmem_usage > cast(NVDevice, self.dev).slm_per_thread: raise RuntimeError(f"Too many resources requested for launch, {prod(local_size)=}, {self.max_threads=}") diff --git a/tinygrad/runtime/ops_qcom.py b/tinygrad/runtime/ops_qcom.py index 6be2bd60f1..08501812dc 100644 --- a/tinygrad/runtime/ops_qcom.py +++ b/tinygrad/runtime/ops_qcom.py @@ -1,5 +1,5 @@ from __future__ import annotations -import os, ctypes, functools, mmap, struct, array, math, sys +import os, ctypes, functools, mmap, struct, array, math, sys, weakref assert sys.platform != 'win32' from types import SimpleNamespace from typing import Any, cast, ClassVar @@ -193,7 +193,7 @@ class QCOMProgram(HCQProgram): self.name, self.lib = name, lib self._parse_lib() - self.lib_gpu: HCQBuffer = self.dev.allocator.alloc(self.image_size, options=BufferSpec(cpu_access=True, nolru=True)) + self.lib_gpu: HCQBuffer = self.dev.allocator.alloc(self.image_size, buf_spec:=BufferSpec(cpu_access=True, nolru=True)) to_mv(cast(int, self.lib_gpu.va_addr), self.image_size)[:] = self.image self.pvtmem_size_per_item: int = round_up(self.pvtmem, 512) >> 9 @@ -205,6 +205,7 @@ class QCOMProgram(HCQProgram): kernargs_alloc_size = round_up(2048 + 
(self.tex_cnt + self.ibo_cnt) * 0x40 + self.samp_cnt * 0x10, 0x100) super().__init__(QCOMArgsState, self.dev, self.name, kernargs_alloc_size=kernargs_alloc_size) + weakref.finalize(self, self._fini, self.dev, self.lib_gpu, buf_spec) def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False): if self.max_threads < prod(local_size): raise RuntimeError("Too many resources requested for launch") @@ -262,9 +263,6 @@ class QCOMProgram(HCQProgram): reg_desc_off = _read_lib(0x34) self.fregs, self.hregs = _read_lib(reg_desc_off + 0x14), _read_lib(reg_desc_off + 0x18) - def __del__(self): - if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, options=BufferSpec(cpu_access=True, nolru=True)) - class QCOMTextureInfo: def __init__(self, pitch:int, real_stride:int, desc:list[int], ibo:list[int]): self.pitch, self.real_stride, self.desc, self.ibo = pitch, real_stride, desc, ibo diff --git a/tinygrad/runtime/support/hcq.py b/tinygrad/runtime/support/hcq.py index 7476337e5a..d82a3d1daa 100644 --- a/tinygrad/runtime/support/hcq.py +++ b/tinygrad/runtime/support/hcq.py @@ -301,6 +301,9 @@ class HCQProgram(Generic[HCQDeviceType]): self.args_state_t, self.dev, self.name, self.kernargs_alloc_size = args_state_t, dev, name, kernargs_alloc_size if PROFILE: Compiled.profile_events += [ProfileProgramEvent(dev.device, name, lib, base)] + @staticmethod + def _fini(dev, buf, spec): dev.allocator.free(buf, buf.size, spec) + def fill_kernargs(self, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=(), kernargs:HCQBuffer|None=None) -> HCQArgsState: """ Fills arguments for the kernel, optionally allocating space from the device if `kernargs_ptr` is not provided.