hcq: fix progs' __del__ on shutdown (#10341)

* debug ci

* better?

* and mute this?

* revert that
This commit is contained in:
nimlgen
2025-05-15 23:26:48 +03:00
committed by GitHub
parent 47b3055fe2
commit a825608dc2
4 changed files with 12 additions and 15 deletions

View File

@@ -1,6 +1,6 @@
from __future__ import annotations
from typing import Any, cast, ClassVar
import os, ctypes, ctypes.util, struct, hashlib, functools, importlib, mmap, errno, array, contextlib, sys, select
import os, ctypes, ctypes.util, struct, hashlib, functools, importlib, mmap, errno, array, contextlib, sys, select, weakref
assert sys.platform != 'win32'
from dataclasses import dataclass
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, FileIOInterface
@@ -425,7 +425,7 @@ class AMDProgram(HCQProgram):
self.dev, self.name, self.lib = dev, name, lib
image, sections, _ = elf_loader(self.lib)
self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000), BufferSpec(cpu_access=True, nolru=True))
self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000), buf_spec:=BufferSpec(cpu_access=True, nolru=True))
self.dev.allocator._copyin(self.lib_gpu, image)
self.dev.synchronize()
@@ -461,9 +461,7 @@ class AMDProgram(HCQProgram):
super().__init__(CLikeArgsState, self.dev, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz, lib=self.lib,
base=self.lib_gpu.va_addr)
def __del__(self):
if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True, nolru=True))
weakref.finalize(self, self._fini, self.dev, self.lib_gpu, buf_spec)
class AMDAllocator(HCQAllocator['AMDDevice']):
def __init__(self, dev:AMDDevice):

View File

@@ -1,5 +1,5 @@
from __future__ import annotations
import os, ctypes, contextlib, re, functools, mmap, struct, array, sys
import os, ctypes, contextlib, re, functools, mmap, struct, array, sys, weakref
assert sys.platform != 'win32'
from typing import Any, cast, Union, Type, ClassVar
from dataclasses import dataclass
@@ -205,7 +205,7 @@ class NVProgram(HCQProgram):
else: image, sections, relocs = elf_loader(self.lib, force_section_align=128)
# NOTE: Ensure at least 4KB of space after the program to mitigate prefetch memory faults.
self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000) + 0x1000, BufferSpec(cpu_access=True))
self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000) + 0x1000, buf_spec:=BufferSpec(cpu_access=True))
self.prog_addr, self.prog_sz, self.regs_usage, self.shmem_usage, self.lcmem_usage = self.lib_gpu.va_addr, image.nbytes, 0, 0x400, 0
self.constbufs: dict[int, tuple[int, int]] = {0: (0, 0x160)} # dict[constbuf index, tuple[va_addr, size]]
@@ -256,6 +256,7 @@ class NVProgram(HCQProgram):
# NV's kernargs is constbuffer, then arguments to the kernel follows. Kernargs also appends QMD at the end of the kernel.
super().__init__(NVArgsState, self.dev, self.name, kernargs_alloc_size=round_up(self.constbufs[0][1], 1 << 8) + (8 << 8))
weakref.finalize(self, self._fini, self.dev, self.lib_gpu, buf_spec)
def _parse_elf_info(self, sh, start_off=0):
while start_off < sh.header.sh_size:
@@ -263,9 +264,6 @@ class NVProgram(HCQProgram):
yield typ, param, sh.content[start_off+4:start_off+sz+4] if typ == 0x4 else sz
start_off += (sz if typ == 0x4 else 0) + 4
def __del__(self):
if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True))
def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
if prod(local_size) > 1024 or self.max_threads < prod(local_size) or self.lcmem_usage > cast(NVDevice, self.dev).slm_per_thread:
raise RuntimeError(f"Too many resources requested for launch, {prod(local_size)=}, {self.max_threads=}")

View File

@@ -1,5 +1,5 @@
from __future__ import annotations
import os, ctypes, functools, mmap, struct, array, math, sys
import os, ctypes, functools, mmap, struct, array, math, sys, weakref
assert sys.platform != 'win32'
from types import SimpleNamespace
from typing import Any, cast, ClassVar
@@ -193,7 +193,7 @@ class QCOMProgram(HCQProgram):
self.name, self.lib = name, lib
self._parse_lib()
self.lib_gpu: HCQBuffer = self.dev.allocator.alloc(self.image_size, options=BufferSpec(cpu_access=True, nolru=True))
self.lib_gpu: HCQBuffer = self.dev.allocator.alloc(self.image_size, buf_spec:=BufferSpec(cpu_access=True, nolru=True))
to_mv(cast(int, self.lib_gpu.va_addr), self.image_size)[:] = self.image
self.pvtmem_size_per_item: int = round_up(self.pvtmem, 512) >> 9
@@ -205,6 +205,7 @@ class QCOMProgram(HCQProgram):
kernargs_alloc_size = round_up(2048 + (self.tex_cnt + self.ibo_cnt) * 0x40 + self.samp_cnt * 0x10, 0x100)
super().__init__(QCOMArgsState, self.dev, self.name, kernargs_alloc_size=kernargs_alloc_size)
weakref.finalize(self, self._fini, self.dev, self.lib_gpu, buf_spec)
def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
if self.max_threads < prod(local_size): raise RuntimeError("Too many resources requested for launch")
@@ -262,9 +263,6 @@ class QCOMProgram(HCQProgram):
reg_desc_off = _read_lib(0x34)
self.fregs, self.hregs = _read_lib(reg_desc_off + 0x14), _read_lib(reg_desc_off + 0x18)
def __del__(self):
if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, options=BufferSpec(cpu_access=True, nolru=True))
class QCOMTextureInfo:
def __init__(self, pitch:int, real_stride:int, desc:list[int], ibo:list[int]):
self.pitch, self.real_stride, self.desc, self.ibo = pitch, real_stride, desc, ibo

View File

@@ -301,6 +301,9 @@ class HCQProgram(Generic[HCQDeviceType]):
self.args_state_t, self.dev, self.name, self.kernargs_alloc_size = args_state_t, dev, name, kernargs_alloc_size
if PROFILE: Compiled.profile_events += [ProfileProgramEvent(dev.device, name, lib, base)]
# Cleanup callback that releases a program's device-side buffer: it calls
# dev.allocator.free(buf, buf.size, spec), with `spec` passed through as given.
# The program classes in this change register it via
# weakref.finalize(self, self._fini, self.dev, self.lib_gpu, buf_spec) in place of __del__.
# It is a staticmethod so the registered callback holds no reference to the program
# instance; everything it needs arrives through its arguments.
@staticmethod
def _fini(dev, buf, spec): dev.allocator.free(buf, buf.size, spec)
def fill_kernargs(self, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=(), kernargs:HCQBuffer|None=None) -> HCQArgsState:
"""
Fills arguments for the kernel, optionally allocating space from the device if `kernargs_ptr` is not provided.