From 7ecced7f6d9c103f0aa32d74dbf146e4eac47f9d Mon Sep 17 00:00:00 2001 From: uuuvn <83587632+uuuvn@users.noreply.github.com> Date: Wed, 15 Jan 2025 19:47:08 +0200 Subject: [PATCH] LLVM JIT prereqs (#8634) * LLVM JIT prereqs This commit moves jit loading, disassembling and CPUProgram logic from `ops_clang.py` to `elf.py`, `helpers.py` and `device.py` respectively I don't quite like the `helpers.py` destination for capstone_flatdump but this is where cpu_objdump is so presumably this is how it's supposed to be * Types --- tinygrad/device.py | 40 +++++++++++++++++++++-- tinygrad/helpers.py | 9 +++++ tinygrad/runtime/ops_clang.py | 58 ++++----------------------------- tinygrad/runtime/support/elf.py | 9 ++++- 4 files changed, 61 insertions(+), 55 deletions(-) diff --git a/tinygrad/device.py b/tinygrad/device.py index a323d06aed..530770f0eb 100644 --- a/tinygrad/device.py +++ b/tinygrad/device.py @@ -2,8 +2,10 @@ from __future__ import annotations from dataclasses import dataclass, replace from collections import defaultdict from typing import Optional, Any, Iterator, Generator -import multiprocessing, importlib, inspect, functools, pathlib, os, ctypes, contextlib, sys, re, atexit, pickle, decimal, time -from tinygrad.helpers import CI, OSX, getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, from_mv, PROFILE, temp +import multiprocessing, importlib, inspect, functools, pathlib, os, ctypes, ctypes.util, platform, contextlib, sys, re, atexit, pickle, decimal, time +from mmap import mmap, PROT_READ, PROT_WRITE, PROT_EXEC, MAP_ANON, MAP_PRIVATE +from tinygrad.helpers import CI, OSX, getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, from_mv, PROFILE, temp, mv_address, \ + cpu_time_execution from tinygrad.dtype import DType, ImageDType, PtrDType, dtypes from tinygrad.renderer import Renderer @@ -213,6 +215,40 @@ class _MallocAllocator(LRUAllocator): MallocAllocator = _MallocAllocator() +# NOTE: MAP_JIT is added to mmap module in python 3.13 +MAP_JIT = 0x0800 + +# CPUProgram is a jit/shellcode program that can be just mmapped and jumped to +class CPUProgram: + helper_handle = ctypes.CDLL(ctypes.util.find_library('System' if OSX else 'gcc_s')) + + def __init__(self, name:str, lib:bytes): + # On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/ + # MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np) + self.mem = mmap(-1, len(lib), MAP_ANON | MAP_PRIVATE | (MAP_JIT if OSX else 0), PROT_READ | PROT_WRITE | PROT_EXEC) + + if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(False) + self.mem.write(lib) + if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(True) + + # __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang. + # libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately + # it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux + # Using ["name"] instead of .name because otherwise name is getting mangled: https://docs.python.org/3.12/reference/expressions.html#index-5 + CPUProgram.helper_handle["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib))) + + self.fxn = ctypes.CFUNCTYPE(None)(mv_address(self.mem)) + + def __call__(self, *bufs, vals=(), wait=False): + args = list(bufs) + list(vals) + # NOTE: replace this by --target={host's triple}-elf in clang args once we only support macos sequoia and later. + # Apple relaxes abi requirement for stack arguments to always be at least 8 byte aligned on arm64 + # https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms + # This hack is required because clang/llvm bug doesn't allow us to just use {host's triple}+'-elf' (relocation failures) + # The bug was fixed in https://github.com/llvm/llvm-project/commit/454cc36630296262cdb6360b60f90a64a97f7f1a but was only backported to xcode 16+ + if platform.machine() == "arm64" and OSX: args = args[:8] + [ctypes.c_int64(a) if isinstance(a, int) else a for a in args[8:]] + return cpu_time_execution(lambda: self.fxn(*args), enable=wait) + # **************** for Compiled Devices **************** class CompileError(Exception): pass diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py index e394557d96..4346255983 100644 --- a/tinygrad/helpers.py +++ b/tinygrad/helpers.py @@ -267,6 +267,15 @@ def cpu_objdump(lib, objdump_tool='objdump'): pathlib.Path(f.name).write_bytes(lib) print(subprocess.check_output([objdump_tool, '-d', f.name]).decode('utf-8')) +def capstone_flatdump(lib: bytes): + import capstone + match platform.machine(): + case 'x86_64': cs = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_64) + case 'aarch64' | 'arm64': cs = capstone.Cs(capstone.CS_ARCH_ARM64, capstone.CS_MODE_ARM) + case machine: raise NotImplementedError(f"Capstone disassembly isn't supported for {machine}") + for instr in cs.disasm(lib, 0): + print(f"{instr.address:#08x}: {instr.mnemonic}\t{instr.op_str}") + # *** ctypes helpers # TODO: make this work with read only memoryviews (if possible) diff --git a/tinygrad/runtime/ops_clang.py b/tinygrad/runtime/ops_clang.py index 56f080681b..689c04c460 100644 --- a/tinygrad/runtime/ops_clang.py +++ b/tinygrad/runtime/ops_clang.py @@ -1,13 +1,9 @@ -import ctypes, ctypes.util, struct, platform, tempfile, pathlib, subprocess -from mmap import mmap, PROT_READ, PROT_WRITE, PROT_EXEC, MAP_ANON, MAP_PRIVATE -from tinygrad.helpers import OSX, mv_address, cpu_time_execution, cpu_objdump -from tinygrad.device import Compiled, Compiler, MallocAllocator -from tinygrad.runtime.support.elf import elf_loader, relocate +import platform, tempfile, pathlib, subprocess +from tinygrad.helpers import cpu_objdump, capstone_flatdump +from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram +from tinygrad.runtime.support.elf import jit_loader from tinygrad.renderer.cstyle import ClangRenderer -# NOTE: MAP_JIT is added to mmap module in python 3.13 -MAP_JIT = 0x0800 - # Used by ops_dsp.py class ClangCompiler(Compiler): def __init__(self, cachekey="compile_clang", args:list[str]|None=None, objdump_tool='objdump'): @@ -33,51 +29,9 @@ class ClangJITCompiler(Compiler): args = ['-march=native', f'--target={platform.machine()}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib'] arch_args = ['-ffixed-x18'] if platform.machine() == 'arm64' else [] obj = subprocess.check_output(['clang', '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8')) - image, _, relocs = elf_loader(obj) - # This is needed because we have an object file, not a .so that has all internal references (like loads of constants from .rodata) resolved. - for ploc,tgt,r_type,r_addend in relocs: - image[ploc:ploc+4] = struct.pack(" bytes: + image, _, relocs = elf_loader(obj) + # This is needed because we have an object file, not a .so that has all internal references (like loads of constants from .rodata) resolved. + for ploc,tgt,r_type,r_addend in relocs: + image[ploc:ploc+4] = struct.pack("