compile QCOM without opening device (#15165)

Co-authored-by: Comma Device <device@comma.ai>
This commit is contained in:
Christopher Milan
2026-03-06 03:24:27 -08:00
committed by GitHub
parent 6fd18ef875
commit 7810be8d3c
9 changed files with 122 additions and 21 deletions

View File

@@ -45,6 +45,10 @@ inputs:
description: "Install mesa"
required: false
default: 'false'
# opt-in flag: the "Install tinydreno (linux)" step only runs when this is 'true' and the runner is Linux
tinydreno:
description: "Install tinydreno"
required: false
default: 'false'
runs:
using: "composite"
steps:
@@ -326,3 +330,9 @@ runs:
if: inputs.mesa == 'true' && runner.os == 'macOS'
shell: bash
run: brew install sirhcm/tinymesa/tinymesa_cpu
# *** tinydreno ***
- name: Install tinydreno (linux)
if: inputs.tinydreno == 'true' && runner.os == 'Linux'
shell: bash
# downloads libllvm-qcom.so from the head of tinydreno's master branch and installs it as /usr/lib/libllvm-qcom.so
# NOTE(review): the download is not pinned to a commit and is not checked against a checksum - consider pinning
run: sudo curl -fL https://github.com/sirhcm/tinydreno/raw/refs/heads/master/libllvm-qcom.so -o /usr/lib/libllvm-qcom.so

View File

@@ -1011,3 +1011,26 @@ jobs:
python -c "from tinygrad import Device; assert Device.DEFAULT == 'NULL'"
DEBUG=4 python3 test/backend/test_ops.py TestOps.test_add
python -m pytest -n=auto test/backend/test_ops.py --durations=20
# runs test_ops on an arm64 Ubuntu runner with NULL=1 and NULL_QCOMCL=1 set, after installing tinydreno
qcomclcompiletests:
name: Compile-only (QCOM CL)
runs-on: ubuntu-24.04-arm
timeout-minutes: 15
steps:
- name: Checkout Code
uses: actions/checkout@v4
- name: Setup Environment
uses: ./.github/actions/setup-tinygrad
with:
key: compile-qcomcl
deps: testing_unit
# asks the setup action to install libllvm-qcom.so (its "Install tinydreno (linux)" step)
tinydreno: 'true'
python-version: '3.12'
- name: Set env
shell: bash
# appended to GITHUB_ENV so the three variables are visible to the steps that follow
run: printf "NULL=1\nNULL_ALLOW_COPYOUT=1\nNULL_QCOMCL=1" >> $GITHUB_ENV
- name: Run test_ops
shell: bash
# 1) assert the default device resolved to NULL, 2) run a single op at DEBUG=4, 3) run the whole test_ops file in parallel
run: |
python -c "from tinygrad import Device; assert Device.DEFAULT == 'NULL'"
DEBUG=4 python3 test/backend/test_ops.py TestOps.test_add
python -m pytest -n=auto test/backend/test_ops.py --durations=20

View File

@@ -6,6 +6,7 @@ from tinygrad.helpers import getenv, IMAGE, DEBUG, CI, Context, CPU_LLVM, AMD_LL
from tinygrad import Tensor, Device, dtypes
from tinygrad.tensor import _to_np_dtype
from tinygrad.device import is_dtype_supported
from tinygrad.renderer.cstyle import QCOMCLRenderer
from tinygrad.renderer.nir import NIRRenderer
TINY_BACKEND = getenv("TINY_BACKEND")
@@ -436,7 +437,7 @@ class TestOps(unittest.TestCase):
helper_test_op([(45,35), (45,35), (45,35)], lambda x,y,z: x.lerp(y,z))
helper_test_op(None, lambda x,y,z: x.lerp(y,z), vals=[[1.,2.,3.], [4.,5.,6.], 0.5])
@unittest.skipIf(Device.DEFAULT == "QCOM", "OpenCL fails to compile this (both on GPU(qcom)/QCOM backends)")
@unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, QCOMCLRenderer), "QCOM CL vectorized bool bug")
def test_tril(self):
helper_test_op([(3,3)], lambda x: x.tril())
helper_test_op([(3,3)], lambda x: x.tril(1))
@@ -454,7 +455,7 @@ class TestOps(unittest.TestCase):
helper_test_op([(5,3,3)], lambda x: x.tril(1))
helper_test_op(None, lambda x: x.tril(), vals=[[[True] * 3] * 3], forward_only=True)
@unittest.skipIf(Device.DEFAULT == "QCOM", "OpenCL fails to compile this (both on GPU(qcom)/QCOM backends)")
@unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, QCOMCLRenderer), "QCOM CL vectorized bool bug")
def test_triu(self):
helper_test_op([(3,3)], lambda x: x.triu())
helper_test_op([(3,3)], lambda x: x.triu(1))
@@ -765,6 +766,7 @@ class TestOps(unittest.TestCase):
self.helper_test_exception([(4), (4)], lambda x,y: x.bitwise_xor(y), expected=RuntimeError)
@unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, QCOMCLRenderer), "QCOM CL vectorized bool bug")
def test_and(self):
data = [[1,-8,1],[32,1,6]]
tor = torch.tensor(data, dtype=torch.int)
@@ -782,6 +784,7 @@ class TestOps(unittest.TestCase):
self.helper_test_exception([(4), (4)], lambda x,y: x.bitwise_and(y), expected=RuntimeError)
@unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, QCOMCLRenderer), "QCOM CL vectorized bool bug")
def test_or(self):
data = [[1,-8,1],[32,1,6]]
tor = torch.tensor(data, dtype=torch.int)
@@ -1170,6 +1173,7 @@ class TestOps(unittest.TestCase):
helper_test_op(None, lambda x: x.type(torch.int32).argmax().type(torch.int32), lambda x: x.argmax(), forward_only=True, vals=[[False, True]])
helper_test_op(None, lambda x: x.type(torch.int32).argmax().type(torch.int32), lambda x: x.argmax(), forward_only=True, vals=[[True, False]])
@unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, QCOMCLRenderer), "QCOM CL vectorized bool bug")
def test_argmin(self):
# check if it returns the first index for multiple occurrences
helper_test_op(None, lambda x: x.argmin().type(torch.int32), lambda x: x.argmin(), forward_only=True, vals=[[2, 2]])
@@ -1475,6 +1479,7 @@ class TestOps(unittest.TestCase):
def test_prod_dtype_arg(self):
with self.assertRaises(AttributeError): Tensor([1.0, 2.0]).prod(dtype="")
@unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, QCOMCLRenderer), "QCOM CL vectorized bool bug")
def test_min(self):
helper_test_op([(3,3)], lambda x: x.min())
helper_test_op([(45,3)], lambda x: x.min())
@@ -1503,7 +1508,6 @@ class TestOps(unittest.TestCase):
helper_test_op([(3,3)], lambda x: torch.full_like(x, 2).prod(), lambda x: (x.full_like(2)).prod(), forward_only=True)
helper_test_op([(3,3)], lambda x: torch.full_like(x, 2).max(), lambda x: (x.full_like(2)).max(), forward_only=True)
@unittest.skipIf(Device.DEFAULT == "QCOM", "OpenCL fails to compile this (both on GPU(qcom)/QCOM backends)")
def test_any(self):
helper_test_op([(3,4,5,6)], lambda x: x.any(), forward_only=True)
helper_test_op(None, lambda x: x.any(), vals=[[True, True]], forward_only=True)
@@ -1515,7 +1519,7 @@ class TestOps(unittest.TestCase):
def test_any_zero_axis(self):
helper_test_op([(1,0,3,0,5)], lambda x: x.any(axis=(1,3)), forward_only=True)
@unittest.skipIf(Device.DEFAULT == "QCOM", "OpenCL fails to compile this (both on GPU(qcom)/QCOM backends)")
@unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, QCOMCLRenderer), "QCOM CL vectorized bool bug")
def test_all(self):
helper_test_op([(3,4,5,6)], lambda x: x.all(), forward_only=True)
helper_test_op(None, lambda x: x.all(), vals=[[True, True]], forward_only=True)
@@ -2889,6 +2893,7 @@ class TestOps(unittest.TestCase):
helper_test_op([(2,5,6,5,3,4)], lambda x: x[...,c,:,e], lambda x: x[...,k,:,p])
@slow_test
@unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, QCOMCLRenderer), "QCOM CL vectorized bool bug")
def test_slice_fancy_indexing_dim_collapse_int(self):
a,b,c,d,e,i,j,k,o,p = self._get_index_randoms()
# dim collapse from int
@@ -2899,6 +2904,7 @@ class TestOps(unittest.TestCase):
helper_test_op([(2,5,6,5,3,4)], lambda x: x[1,:,3:11:2,d,0:2], lambda x: x[1,:,3:11:2,o,0:2])
@slow_test
@unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, QCOMCLRenderer), "QCOM CL vectorized bool bug")
def test_slice_fancy_indexing_dim_inject_none(self):
a,b,c,d,e,i,j,k,o,p = self._get_index_randoms()
# dim injection from None
@@ -2933,6 +2939,7 @@ class TestOps(unittest.TestCase):
lambda x: x[Tensor([[0,1,-1],[-1,-2,0]]), Tensor([2,1,-1])])
@slow_test
@unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, QCOMCLRenderer), "QCOM CL vectorized bool bug")
def test_slice_fancy_indexing_list_indices(self):
a,b,c,d,e,i,j,k,o,p = self._get_index_randoms()
helper_test_op([(2,5,6,5,3,4)], lambda x: x[((0,),)])
@@ -2944,6 +2951,7 @@ class TestOps(unittest.TestCase):
helper_test_op([(2,5,6,5,3,4)], lambda x: x[a,(2,1,0),c,(-2,1,0),e], lambda x: x[i,(2,1,0),k,(-2,1,0),p])
@slow_test
@unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, QCOMCLRenderer), "QCOM CL vectorized bool bug")
def test_slice_fancy_indexing_tuple_indices(self):
a,b,c,d,e,i,j,k,o,p = self._get_index_randoms()
helper_test_op([(2,5,6,5,3,4)], lambda x: x[(((0,),),)], lambda x: x[(((0,),),)])
@@ -3285,7 +3293,6 @@ class TestOps(unittest.TestCase):
helper_test_op([(20,)], lambda x: (x>0.5).nonzero().int(), lambda x: (x>0.5).nonzero(), forward_only=True)
helper_test_op([(10, 5, 3)], lambda x: (x>0.5).nonzero().int(), lambda x: (x>0.5).nonzero(), forward_only=True)
@unittest.skipIf(Device.DEFAULT == "QCOM", "OpenCL fails to compile this (both on GPU(qcom)/QCOM backends)")
def test_cast(self):
helper_test_op([(3, 3)], lambda x: x.float())
helper_test_op(None, lambda x: x.float(), vals=[[0, 1, 2, 3]], forward_only=True)

View File

@@ -6,7 +6,7 @@ import importlib, inspect, functools, pathlib, os, platform, contextlib, sys, re
from tinygrad.helpers import CI, OSX, LRU, getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, PROFILE, temp, colored
from tinygrad.helpers import Context, CCACHE, ALLOW_DEVICE_USAGE, MAX_BUFFER_SIZE, cpu_events, ProfileEvent, ProfilePointEvent, dedup, ContextVar
from tinygrad.helpers import unwrap_class_type, suppress_finalizing, select_first_inited, VIZ, CPU_LLVM, CPU_LVP, NV_PTX, CUDA_PTX, NV_NAK
from tinygrad.helpers import EMULATED_DTYPES, TracingKey
from tinygrad.helpers import EMULATED_DTYPES, NULL_IR3, NULL_QCOMCL, TracingKey
from tinygrad.dtype import DType, ImageDType, PtrDType, dtypes, _to_np_dtype
if TYPE_CHECKING: from tinygrad.renderer import Renderer
@@ -371,7 +371,7 @@ def is_dtype_supported(dtype:DType, device:str|None=None) -> bool:
if device in ["CUDA", "NV"]: return not CI
if device == "CPU" and CPU_LLVM: return OSX
if device == "PYTHON": return sys.version_info >= (3, 12)
if dtype == dtypes.float64: return (device not in {"METAL", "QCOM"} and not (OSX and device == "CL") and not getenv("NULL_IR3")
if dtype == dtypes.float64: return (device not in {"METAL", "QCOM"} and not (OSX and device == "CL") and not NULL_IR3 and not NULL_QCOMCL
and dtypes.long not in EMULATED_DTYPES.tolist(dtypes))
return True

View File

@@ -195,7 +195,8 @@ CPU_COUNT = ContextVar("CPU_COUNT", max(1, len(os.sched_getaffinity(0)) if hasat
CPU_CC, CPU_LLVM, CPU_LVP = ContextVar("CPU_CC", ""), ContextVar("CPU_LLVM", 0), ContextVar("CPU_LVP", 0)
NV_CC, NV_PTX, NV_NAK, NV_NVCC = ContextVar("NV_CC", ""), ContextVar("NV_PTX", 0), ContextVar("NV_NAK", 0), ContextVar("NV_NVCC", 0)
CUDA_CC, CUDA_PTX, CUDA_NVCC = ContextVar("CUDA_CC", ""), ContextVar("CUDA_PTX", 0), ContextVar("CUDA_NVCC", 0)
NULL_IR3, NULL_NAK, NULL_ALLOW_COPYOUT = ContextVar("NULL_IR3", 0), ContextVar("NULL_NAK", 0), ContextVar("NULL_ALLOW_COPYOUT", 0)
NULL_QCOMCL, NULL_IR3, NULL_NAK = ContextVar("NULL_QCOMCL", 0), ContextVar("NULL_IR3", 0), ContextVar("NULL_NAK", 0)
NULL_ALLOW_COPYOUT = ContextVar("NULL_ALLOW_COPYOUT", 0)
AMD_CC, AMD_LLVM, AMD_HIPCC = ContextVar("AMD_CC", ""), ContextVar("AMD_LLVM", 0), ContextVar("AMD_HIPCC", 0)
QCOM_CC, QCOM_IR3 = ContextVar("QCOM_CC", ""), ContextVar("QCOM_IR3", 0)
# VIZ implies PROFILE, but you can run PROFILE without VIZ

View File

@@ -566,4 +566,9 @@ class AMDHIPCCRenderer(AMDHIPRenderer):
super().__init__(arch)
self.compiler = HIPCCCompiler(arch)
# OpenCLRenderer subclass that only overrides the target device name
class QCOMRenderer(OpenCLRenderer): device = "QCOM"
class QCOMCLRenderer(OpenCLRenderer):
  """OpenCL renderer for the QCOM device whose `compiler` is a `QCOMCompiler` built for `chip_id`."""
  device = "QCOM"
  def __init__(self, chip_id):
    # function-scope import: compiler_qcom resolves llvm-qcom symbols when it is imported,
    # presumably deferred so this module can be imported without that library - TODO confirm
    from tinygrad.runtime.support import compiler_qcom
    self.compiler = compiler_qcom.QCOMCompiler(chip_id)

View File

@@ -1,9 +1,9 @@
import functools
from tinygrad.device import Compiled, Allocator, CompilerSet
from tinygrad.engine.jit import MultiGraphRunner
from tinygrad.renderer.cstyle import Renderer, CStyleLanguage, AMDHIPRenderer
from tinygrad.renderer.cstyle import Renderer, CStyleLanguage, AMDHIPRenderer, QCOMCLRenderer
from tinygrad.uop.ops import Ops
from tinygrad.helpers import cpu_profile, EMULATE, NULL_IR3, NULL_NAK, NULL_ALLOW_COPYOUT
from tinygrad.helpers import cpu_profile, EMULATE, NULL_QCOMCL, NULL_IR3, NULL_NAK, NULL_ALLOW_COPYOUT
from tinygrad.renderer.nir import IR3Renderer, NAKRenderer
class NullRenderer(CStyleLanguage):
@@ -39,6 +39,7 @@ class NullDevice(Compiled):
case "AMD_CDNA4": renderer = functools.partial(AMDHIPRenderer, "gfx950")
case "": renderer = NullRenderer
case _: raise RuntimeError(f"can't EMULATE device: {EMULATE.value}")
compilers = CompilerSet([(renderer, None), (functools.partial(IR3Renderer, 0x6030001), NULL_IR3), # adreno 630
compilers = CompilerSet([(renderer, None), (functools.partial(QCOMCLRenderer, 0x6030001), NULL_QCOMCL), # adreno 630
(functools.partial(IR3Renderer, 0x6030001), NULL_IR3), # adreno 630
(functools.partial(NAKRenderer, "sm_120", 48), NULL_NAK)]) # 5090
super().__init__(device, NullAllocator(self), compilers, functools.partial(NullProgram, device), NullGraph)

View File

@@ -6,11 +6,10 @@ from tinygrad.device import BufferSpec, CompilerSet, Device
from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState, BumpAllocator
from tinygrad.runtime.support.hcq import FileIOInterface, MMIOInterface
from tinygrad.runtime.autogen import kgsl, mesa
from tinygrad.runtime.ops_cl import CLDevice
from tinygrad.renderer.cstyle import QCOMRenderer
from tinygrad.renderer.cstyle import QCOMCLRenderer
from tinygrad.renderer.nir import IR3Renderer
from tinygrad.helpers import getenv, mv_address, to_mv, round_up, data64_le, ceildiv, prod, fromimport, cpu_profile, lo32, suppress_finalizing
from tinygrad.helpers import next_power2, flatten, QCOM_IR3, QCOM_CC, PROFILE, DEBUG
from tinygrad.helpers import getenv, mv_address, to_mv, round_up, data64_le, ceildiv, prod, cpu_profile, lo32, suppress_finalizing
from tinygrad.helpers import next_power2, flatten, QCOM_IR3, QCOM_CC, PROFILE
from tinygrad.dtype import ImageDType, dtypes
from tinygrad.runtime.support.system import System
if getenv("IOCTL"): import extra.qcom_gpu_driver.opencl_ioctl # noqa: F401 # pylint: disable=unused-import
@@ -248,9 +247,7 @@ class QCOMProgram(HCQProgram):
self.tex_off, self.ibo_off, self.samp_off = 2048, 2048 + 0x40 * self.tex_cnt, 2048 + 0x40 * (self.tex_cnt + self.ibo_cnt)
self.fregs, self.hregs = v.info.max_reg + 1, v.info.max_half_reg + 1
self.consts_info:list[tuple] = []
else:
self._parse_lib(lib:=self.dev.cl_dev.cl_compiler.compile_cached(lib.decode()))
if DEBUG >= 7: fromimport('tinygrad.runtime.support.compiler_mesa', 'disas_adreno')(lib[(ofs:=_read_lib(lib, 0xc0)):ofs+_read_lib(lib, 0x100)])
else: self._parse_lib(lib)
self.lib_gpu: HCQBuffer = self.dev.allocator.alloc(self.image_size, buf_spec:=BufferSpec(cpu_access=True, nolru=True))
to_mv(self.lib_gpu.va_addr, self.image_size)[:] = self.image
@@ -384,8 +381,8 @@ class QCOMDevice(HCQCompiled):
if PROFILE and self.gpu_id[:2] < (7, 3):
System.write_sysfs("/sys/class/kgsl/kgsl-3d0/idle_timer", value="4000000000", msg="Failed to disable suspend mode", expected="4294967276")
self.cl_dev = CLDevice(device)
compilers = CompilerSet(ctrl_var=QCOM_CC, cset=[(QCOMRenderer, None), (functools.partial(IR3Renderer, info.chip_id), QCOM_IR3)])
compilers = CompilerSet(ctrl_var=QCOM_CC, cset=[(functools.partial(QCOMCLRenderer, info.chip_id), None),
(functools.partial(IR3Renderer, info.chip_id), QCOM_IR3)])
super().__init__(device, QCOMAllocator(self), compilers, functools.partial(QCOMProgram, self), QCOMSignal,
functools.partial(QCOMComputeQueue, self), None)

View File

@@ -0,0 +1,57 @@
import ctypes, struct
from tinygrad.device import Compiler
from tinygrad.runtime.support.c import DLL
from tinygrad.runtime.support.compiler_mesa import disas_adreno
# see https://github.com/sirhcm/tinydreno
# Bind the llvm-qcom entry points used by QCOMCompiler and declare their C signatures for ctypes.
dll = DLL("llvm-qcom", ["llvm-qcom"])

# no arguments, returns an opaque instance pointer
create_llvm_instance = dll.cl_compiler_create_llvm_instance
create_llvm_instance.restype = ctypes.c_void_p
create_llvm_instance.argtypes = []

# returns an opaque handle pointer; see QCOMCompiler.compile for how the 11 arguments are filled in
compile_source = dll.cl_compiler_compile_source
compile_source.restype = ctypes.c_void_p
compile_source.argtypes = [
  ctypes.c_void_p, ctypes.c_uint64, ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_uint64, ctypes.c_uint64,
  ctypes.c_char_p, ctypes.c_uint64, ctypes.c_uint64, ctypes.c_void_p,
]

# returns an opaque handle pointer; the last argument is passed a pointer to the handle(s) to link
link_program = dll.cl_compiler_link_program
link_program.restype = ctypes.c_void_p
link_program.argtypes = [ctypes.c_void_p, ctypes.c_uint64, ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_void_p]

# both take a handle; the error code is a C int and the build log a C string (bytes on the Python side)
get_error_code = dll.cl_compiler_get_error_code
get_error_code.restype = ctypes.c_int
get_error_code.argtypes = [ctypes.c_void_p]
get_build_log = dll.cl_compiler_get_build_log
get_build_log.restype = ctypes.c_char_p
get_build_log.argtypes = [ctypes.c_void_p]

# writes the binary's address and size through the two out-pointers
handle_create_binary = dll.cl_compiler_handle_create_binary
handle_create_binary.restype = None
handle_create_binary.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_void_p), ctypes.POINTER(ctypes.c_size_t)]

# release functions: each takes one pointer and returns nothing
free_handle = dll.cl_compiler_free_handle
free_handle.restype = None
free_handle.argtypes = [ctypes.c_void_p]
free_assembly = dll.cl_compiler_free_assembly
free_assembly.restype = None
free_assembly.argtypes = [ctypes.c_void_p]
destroy_llvm_instance = dll.cl_compiler_destroy_llvm_instance
destroy_llvm_instance.restype = None
destroy_llvm_instance.argtypes = [ctypes.c_void_p]

# MODE_* is passed as the third argument of compile_source/link_program, SRC_* as the tenth argument of compile_source
MODE_32BIT, MODE_64BIT = 0, 1
SRC_STR, SRC_BLOB = 0, 1
def _read_lib(lib, off) -> int: return struct.unpack("I", lib[off:off+4])[0]
class QCOMCompiler(Compiler):
  """Compile OpenCL source into a binary through the llvm-qcom library bindings of this module.

  Each compiler object owns one native LLVM instance, created in `__init__` and destroyed in `__del__`.
  `chip_id` is forwarded unchanged to every native compile/link call and to the disassembler.
  """
  def __init__(self, chip_id):
    self.chip_id, self.llvm_inst = chip_id, create_llvm_instance()
    # the string handed to Compiler embeds the chip id (presumably the compile-cache key - confirm in tinygrad.device)
    super().__init__(f"compile_qcomcl_{chip_id}")
  def __del__(self):
    # llvm_inst is missing if __init__ raised before assigning it, and None if the library returned NULL
    if (inst:=getattr(self, "llvm_inst", None)) is not None: destroy_llvm_instance(inst)
  def __reduce__(self): return QCOMCompiler, (self.chip_id,)  # pickling rebuilds the compiler (and a fresh instance) from chip_id
  def checked(self, handle):
    """Return `handle` if it is non-NULL and reports error code 0.

    Otherwise replace the LLVM instance with a fresh one and raise RuntimeError, including the build log when there is one.
    """
    if handle is not None and get_error_code(handle) == 0: return handle
    # copy the log out before tearing down the instance that produced the handle
    log = None if handle is None else get_build_log(handle)
    destroy_llvm_instance(self.llvm_inst)
    self.llvm_inst = create_llvm_instance()
    # get_build_log yields bytes; decode so the message is readable text rather than a b'...' repr with escaped newlines
    raise RuntimeError("QCOM Compilation Error" + (f": {log.decode(errors='replace')}" if log else ""))
  def compile(self, src) -> bytes:
    """Compile and link the source string `src` and return the resulting binary as bytes.

    Raises RuntimeError (via `checked`) if either the compile or the link step fails.
    """
    ch = self.checked(compile_source(self.llvm_inst, self.chip_id, MODE_64BIT, b"", 0, 0, 0, src.encode(), 0, SRC_STR, None))
    # link exactly one compiled handle (count 1, pointer to that handle)
    lh = self.checked(link_program(self.llvm_inst, self.chip_id, MODE_64BIT, None, 1, ctypes.pointer(ctypes.c_void_p(ch))))
    handle_create_binary(lh, ctypes.byref(ptr:=ctypes.c_void_p()), ctypes.byref(sz:=ctypes.c_size_t()))
    for h in [ch, lh]: free_handle(h)
    # copy the binary into Python-owned bytes before releasing the native buffer
    ret = ctypes.string_at(ptr, sz.value)
    free_assembly(ptr)
    return ret
  # the slice handed to the disassembler starts at the 32-bit value stored at 0xc0 and its length is the one stored at 0x100
  def disassemble(self, lib: bytes): disas_adreno(lib[(ofs:=_read_lib(lib, 0xc0)):ofs+_read_lib(lib, 0x100)], self.chip_id)