mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-13 08:28:55 +08:00
hcq: fix progs' __del__ when shutdown (#10341)
* debug ci * better? * and mute this? * revrt that
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
from __future__ import annotations
|
||||
from typing import Any, cast, ClassVar
|
||||
import os, ctypes, ctypes.util, struct, hashlib, functools, importlib, mmap, errno, array, contextlib, sys, select
|
||||
import os, ctypes, ctypes.util, struct, hashlib, functools, importlib, mmap, errno, array, contextlib, sys, select, weakref
|
||||
assert sys.platform != 'win32'
|
||||
from dataclasses import dataclass
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, FileIOInterface
|
||||
@@ -425,7 +425,7 @@ class AMDProgram(HCQProgram):
|
||||
self.dev, self.name, self.lib = dev, name, lib
|
||||
|
||||
image, sections, _ = elf_loader(self.lib)
|
||||
self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000), BufferSpec(cpu_access=True, nolru=True))
|
||||
self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000), buf_spec:=BufferSpec(cpu_access=True, nolru=True))
|
||||
self.dev.allocator._copyin(self.lib_gpu, image)
|
||||
self.dev.synchronize()
|
||||
|
||||
@@ -461,9 +461,7 @@ class AMDProgram(HCQProgram):
|
||||
|
||||
super().__init__(CLikeArgsState, self.dev, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz, lib=self.lib,
|
||||
base=self.lib_gpu.va_addr)
|
||||
|
||||
def __del__(self):
|
||||
if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True, nolru=True))
|
||||
weakref.finalize(self, self._fini, self.dev, self.lib_gpu, buf_spec)
|
||||
|
||||
class AMDAllocator(HCQAllocator['AMDDevice']):
|
||||
def __init__(self, dev:AMDDevice):
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
from __future__ import annotations
|
||||
import os, ctypes, contextlib, re, functools, mmap, struct, array, sys
|
||||
import os, ctypes, contextlib, re, functools, mmap, struct, array, sys, weakref
|
||||
assert sys.platform != 'win32'
|
||||
from typing import Any, cast, Union, Type, ClassVar
|
||||
from dataclasses import dataclass
|
||||
@@ -205,7 +205,7 @@ class NVProgram(HCQProgram):
|
||||
else: image, sections, relocs = elf_loader(self.lib, force_section_align=128)
|
||||
|
||||
# NOTE: Ensure at least 4KB of space after the program to mitigate prefetch memory faults.
|
||||
self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000) + 0x1000, BufferSpec(cpu_access=True))
|
||||
self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000) + 0x1000, buf_spec:=BufferSpec(cpu_access=True))
|
||||
|
||||
self.prog_addr, self.prog_sz, self.regs_usage, self.shmem_usage, self.lcmem_usage = self.lib_gpu.va_addr, image.nbytes, 0, 0x400, 0
|
||||
self.constbufs: dict[int, tuple[int, int]] = {0: (0, 0x160)} # dict[constbuf index, tuple[va_addr, size]]
|
||||
@@ -256,6 +256,7 @@ class NVProgram(HCQProgram):
|
||||
|
||||
# NV's kernargs is constbuffer, then arguments to the kernel follows. Kernargs also appends QMD at the end of the kernel.
|
||||
super().__init__(NVArgsState, self.dev, self.name, kernargs_alloc_size=round_up(self.constbufs[0][1], 1 << 8) + (8 << 8))
|
||||
weakref.finalize(self, self._fini, self.dev, self.lib_gpu, buf_spec)
|
||||
|
||||
def _parse_elf_info(self, sh, start_off=0):
|
||||
while start_off < sh.header.sh_size:
|
||||
@@ -263,9 +264,6 @@ class NVProgram(HCQProgram):
|
||||
yield typ, param, sh.content[start_off+4:start_off+sz+4] if typ == 0x4 else sz
|
||||
start_off += (sz if typ == 0x4 else 0) + 4
|
||||
|
||||
def __del__(self):
|
||||
if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True))
|
||||
|
||||
def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
|
||||
if prod(local_size) > 1024 or self.max_threads < prod(local_size) or self.lcmem_usage > cast(NVDevice, self.dev).slm_per_thread:
|
||||
raise RuntimeError(f"Too many resources requested for launch, {prod(local_size)=}, {self.max_threads=}")
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
from __future__ import annotations
|
||||
import os, ctypes, functools, mmap, struct, array, math, sys
|
||||
import os, ctypes, functools, mmap, struct, array, math, sys, weakref
|
||||
assert sys.platform != 'win32'
|
||||
from types import SimpleNamespace
|
||||
from typing import Any, cast, ClassVar
|
||||
@@ -193,7 +193,7 @@ class QCOMProgram(HCQProgram):
|
||||
self.name, self.lib = name, lib
|
||||
self._parse_lib()
|
||||
|
||||
self.lib_gpu: HCQBuffer = self.dev.allocator.alloc(self.image_size, options=BufferSpec(cpu_access=True, nolru=True))
|
||||
self.lib_gpu: HCQBuffer = self.dev.allocator.alloc(self.image_size, buf_spec:=BufferSpec(cpu_access=True, nolru=True))
|
||||
to_mv(cast(int, self.lib_gpu.va_addr), self.image_size)[:] = self.image
|
||||
|
||||
self.pvtmem_size_per_item: int = round_up(self.pvtmem, 512) >> 9
|
||||
@@ -205,6 +205,7 @@ class QCOMProgram(HCQProgram):
|
||||
|
||||
kernargs_alloc_size = round_up(2048 + (self.tex_cnt + self.ibo_cnt) * 0x40 + self.samp_cnt * 0x10, 0x100)
|
||||
super().__init__(QCOMArgsState, self.dev, self.name, kernargs_alloc_size=kernargs_alloc_size)
|
||||
weakref.finalize(self, self._fini, self.dev, self.lib_gpu, buf_spec)
|
||||
|
||||
def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
|
||||
if self.max_threads < prod(local_size): raise RuntimeError("Too many resources requested for launch")
|
||||
@@ -262,9 +263,6 @@ class QCOMProgram(HCQProgram):
|
||||
reg_desc_off = _read_lib(0x34)
|
||||
self.fregs, self.hregs = _read_lib(reg_desc_off + 0x14), _read_lib(reg_desc_off + 0x18)
|
||||
|
||||
def __del__(self):
|
||||
if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, options=BufferSpec(cpu_access=True, nolru=True))
|
||||
|
||||
class QCOMTextureInfo:
|
||||
def __init__(self, pitch:int, real_stride:int, desc:list[int], ibo:list[int]):
|
||||
self.pitch, self.real_stride, self.desc, self.ibo = pitch, real_stride, desc, ibo
|
||||
|
||||
@@ -301,6 +301,9 @@ class HCQProgram(Generic[HCQDeviceType]):
|
||||
self.args_state_t, self.dev, self.name, self.kernargs_alloc_size = args_state_t, dev, name, kernargs_alloc_size
|
||||
if PROFILE: Compiled.profile_events += [ProfileProgramEvent(dev.device, name, lib, base)]
|
||||
|
||||
@staticmethod
|
||||
def _fini(dev, buf, spec):
    """Release `buf` back to `dev`'s allocator.

    The program classes register this helper through
    `weakref.finalize(self, self._fini, self.dev, self.lib_gpu, buf_spec)`,
    so everything it needs arrives as explicit arguments rather than via `self`.

    Args:
        dev: device object exposing an `allocator` with a `free` method.
        buf: buffer to release; its `size` attribute is forwarded to `free`.
        spec: buffer spec the buffer was allocated with, forwarded unchanged.
    """
    allocator = dev.allocator
    allocator.free(buf, buf.size, spec)
|
||||
|
||||
def fill_kernargs(self, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=(), kernargs:HCQBuffer|None=None) -> HCQArgsState:
|
||||
"""
|
||||
Fills arguments for the kernel, optionally allocating space from the device if `kernargs_ptr` is not provided.
|
||||
|
||||
Reference in New Issue
Block a user