mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-13 00:15:35 +08:00
LLVM JIT prereqs (#8634)
* LLVM JIT prereqs: This commit moves the JIT loading, disassembly and CPUProgram logic from `ops_clang.py` to `elf.py`, `helpers.py` and `device.py` respectively. I don't quite like `helpers.py` as the destination for capstone_flatdump, but that is where cpu_objdump lives, so presumably this is how it's supposed to be. * Types
This commit is contained in:
@@ -2,8 +2,10 @@ from __future__ import annotations
|
||||
from dataclasses import dataclass, replace
|
||||
from collections import defaultdict
|
||||
from typing import Optional, Any, Iterator, Generator
|
||||
import multiprocessing, importlib, inspect, functools, pathlib, os, ctypes, contextlib, sys, re, atexit, pickle, decimal, time
|
||||
from tinygrad.helpers import CI, OSX, getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, from_mv, PROFILE, temp
|
||||
import multiprocessing, importlib, inspect, functools, pathlib, os, ctypes, ctypes.util, platform, contextlib, sys, re, atexit, pickle, decimal, time
|
||||
from mmap import mmap, PROT_READ, PROT_WRITE, PROT_EXEC, MAP_ANON, MAP_PRIVATE
|
||||
from tinygrad.helpers import CI, OSX, getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, from_mv, PROFILE, temp, mv_address, \
|
||||
cpu_time_execution
|
||||
from tinygrad.dtype import DType, ImageDType, PtrDType, dtypes
|
||||
from tinygrad.renderer import Renderer
|
||||
|
||||
@@ -213,6 +215,40 @@ class _MallocAllocator(LRUAllocator):
|
||||
|
||||
MallocAllocator = _MallocAllocator()
|
||||
|
||||
# NOTE: MAP_JIT is added to mmap module in python 3.13
|
||||
MAP_JIT = 0x0800
|
||||
|
||||
# CPUProgram is a jit/shellcode program that can be just mmapped and jumped to
|
||||
class CPUProgram:
  """Machine code copied into an executable anonymous mapping and invoked through ctypes."""

  # Shared library that provides __clear_cache: libSystem on macOS, libgcc_s otherwise. Loaded once, when the class is created.
  helper_handle = ctypes.CDLL(ctypes.util.find_library('System' if OSX else 'gcc_s'))

  def __init__(self, name:str, lib:bytes):
    """Map `lib` into read/write/execute memory and build a ctypes callable that jumps to its first byte.

    `name` is accepted but is not used by this constructor.
    """
    nbytes = len(lib)

    # On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/
    # MAP_JIT lets the pages be flipped between RW- and R-X. It is a noop on intel cpus. (man pthread_jit_write_protect_np)
    map_flags = MAP_ANON | MAP_PRIVATE | (MAP_JIT if OSX else 0)
    self.mem = mmap(-1, nbytes, map_flags, PROT_READ | PROT_WRITE | PROT_EXEC)

    # Disable JIT write protection for the copy, then re-enable it (macOS only).
    if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(False)
    self.mem.write(lib)
    if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(True)

    # __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang.
    # libgcc_s ships as a shared library, while compiler-rt is only static archives that can't be loaded directly; on macos the routine
    # is reachable through libSystem, and libgcc_s is ~always present on linux.
    # The symbol is looked up with ["name"] instead of .name because a double-underscore attribute would be name-mangled inside a class:
    # https://docs.python.org/3.12/reference/expressions.html#index-5
    base = mv_address(self.mem)
    CPUProgram.helper_handle["__clear_cache"](ctypes.c_void_p(base), ctypes.c_void_p(base + nbytes))

    # Function pointer with a None restype whose target is the start of the mapping.
    self.fxn = ctypes.CFUNCTYPE(None)(base)

  def __call__(self, *bufs, vals=(), wait=False):
    """Call the mapped code with `bufs` followed by `vals`; returns whatever cpu_time_execution returns (timing enabled by `wait`)."""
    args = [*bufs, *vals]
    # NOTE: replace this by --target={host's triple}-elf in clang args once we only support macos sequoia and later.
    # Apple relaxes abi requirement for stack arguments to always be at least 8 byte aligned on arm64
    # https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms
    # This hack is required because clang/llvm bug doesn't allow us to just use {host's triple}+'-elf' (relocation failures)
    # The bug was fixed in https://github.com/llvm/llvm-project/commit/454cc36630296262cdb6360b60f90a64a97f7f1a but was only backported to xcode 16+
    if OSX and platform.machine() == "arm64":
      # Arguments past the eighth are wrapped as 64-bit ints when they are Python ints; everything else is passed through.
      widened = [ctypes.c_int64(a) if isinstance(a, int) else a for a in args[8:]]
      args = args[:8] + widened
    return cpu_time_execution(lambda: self.fxn(*args), enable=wait)
|
||||
|
||||
# **************** for Compiled Devices ****************
|
||||
|
||||
# Plain Exception subclass with no extra state or behavior.
# NOTE(review): presumably raised when compiling a kernel fails -- confirm against the callers.
class CompileError(Exception):
  pass
|
||||
|
||||
@@ -267,6 +267,15 @@ def cpu_objdump(lib, objdump_tool='objdump'):
|
||||
pathlib.Path(f.name).write_bytes(lib)
|
||||
print(subprocess.check_output([objdump_tool, '-d', f.name]).decode('utf-8'))
|
||||
|
||||
def capstone_flatdump(lib: bytes):
|
||||
import capstone
|
||||
match platform.machine():
|
||||
case 'x86_64': cs = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_64)
|
||||
case 'aarch64' | 'arm64': cs = capstone.Cs(capstone.CS_ARCH_ARM64, capstone.CS_MODE_ARM)
|
||||
case machine: raise NotImplementedError(f"Capstone disassembly isn't supported for {machine}")
|
||||
for instr in cs.disasm(lib, 0):
|
||||
print(f"{instr.address:#08x}: {instr.mnemonic}\t{instr.op_str}")
|
||||
|
||||
# *** ctypes helpers
|
||||
|
||||
# TODO: make this work with read only memoryviews (if possible)
|
||||
|
||||
@@ -1,13 +1,9 @@
|
||||
import ctypes, ctypes.util, struct, platform, tempfile, pathlib, subprocess
|
||||
from mmap import mmap, PROT_READ, PROT_WRITE, PROT_EXEC, MAP_ANON, MAP_PRIVATE
|
||||
from tinygrad.helpers import OSX, mv_address, cpu_time_execution, cpu_objdump
|
||||
from tinygrad.device import Compiled, Compiler, MallocAllocator
|
||||
from tinygrad.runtime.support.elf import elf_loader, relocate
|
||||
import platform, tempfile, pathlib, subprocess
|
||||
from tinygrad.helpers import cpu_objdump, capstone_flatdump
|
||||
from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram
|
||||
from tinygrad.runtime.support.elf import jit_loader
|
||||
from tinygrad.renderer.cstyle import ClangRenderer
|
||||
|
||||
# NOTE: MAP_JIT is added to mmap module in python 3.13
|
||||
MAP_JIT = 0x0800
|
||||
|
||||
# Used by ops_dsp.py
|
||||
class ClangCompiler(Compiler):
|
||||
def __init__(self, cachekey="compile_clang", args:list[str]|None=None, objdump_tool='objdump'):
|
||||
@@ -33,51 +29,9 @@ class ClangJITCompiler(Compiler):
|
||||
args = ['-march=native', f'--target={platform.machine()}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib']
|
||||
arch_args = ['-ffixed-x18'] if platform.machine() == 'arm64' else []
|
||||
obj = subprocess.check_output(['clang', '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8'))
|
||||
image, _, relocs = elf_loader(obj)
|
||||
# This is needed because we have an object file, not a .so that has all internal references (like loads of constants from .rodata) resolved.
|
||||
for ploc,tgt,r_type,r_addend in relocs:
|
||||
image[ploc:ploc+4] = struct.pack("<I", relocate(struct.unpack("<I", image[ploc:ploc+4])[0], ploc, tgt+r_addend, r_type))
|
||||
return bytes(image)
|
||||
return jit_loader(obj)
|
||||
|
||||
def disassemble(self, lib):
  """Print a capstone disassembly of `lib` for the host CPU, one instruction per line, starting at address 0.

  Raises NotImplementedError when platform.machine() is neither 'x86_64' nor 'aarch64'/'arm64'.
  """
  import capstone
  machine = platform.machine()
  if machine == 'x86_64':
    arch, mode = capstone.CS_ARCH_X86, capstone.CS_MODE_64
  elif machine in ('aarch64', 'arm64'):
    arch, mode = capstone.CS_ARCH_ARM64, capstone.CS_MODE_ARM
  else:
    raise NotImplementedError(f"Capstone disassembly isn't supported for {machine}")
  disassembler = capstone.Cs(arch, mode)
  for insn in disassembler.disasm(lib, 0):
    print(f"{insn.address:#08x}: {insn.mnemonic}\t{insn.op_str}")
|
||||
|
||||
# CPUProgram is a jit/shellcode program that can be just mmapped and jumped to
|
||||
class CPUProgram:
  """Callable wrapper around raw machine code placed in an anonymous executable mmap."""

  # Handle to the library exporting __clear_cache ('System' on macOS, 'gcc_s' otherwise), resolved when the class body runs.
  helper_handle = ctypes.CDLL(ctypes.util.find_library('System' if OSX else 'gcc_s'))

  def __init__(self, name:str, lib:bytes):
    """Copy `lib` into a fresh RWX mapping, flush the instruction cache over it and create the ctypes entry point.

    `name` is part of the signature but is not read here.
    """
    code_len = len(lib)
    jit_flag = MAP_JIT if OSX else 0
    # On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/
    # MAP_JIT allows flipping pages from RW- to R-X and back. It is a noop on intel cpus. (man pthread_jit_write_protect_np)
    self.mem = mmap(-1, code_len, MAP_ANON | MAP_PRIVATE | jit_flag, PROT_READ | PROT_WRITE | PROT_EXEC)

    # macOS only: turn JIT write protection off while the code is copied in, then turn it back on.
    if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(False)
    self.mem.write(lib)
    if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(True)

    # __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang.
    # compiler-rt only exists as static archives that can't be loaded directly, but the routine is available from libSystem on macos,
    # and libgcc_s (a shared library) is ~always present on linux.
    # ["name"] is used instead of .name because otherwise the name gets mangled: https://docs.python.org/3.12/reference/expressions.html#index-5
    start = mv_address(self.mem)
    end = start + code_len
    CPUProgram.helper_handle["__clear_cache"](ctypes.c_void_p(start), ctypes.c_void_p(end))

    # ctypes prototype with restype None, pointed at the first byte of the mapping.
    self.fxn = ctypes.CFUNCTYPE(None)(start)

  def __call__(self, *bufs, vals=(), wait=False):
    """Invoke the mapped code with `bufs` then `vals` as arguments, via cpu_time_execution (timing enabled by `wait`)."""
    args = list(bufs)
    args.extend(vals)
    # NOTE: replace this by --target={host's triple}-elf in clang args once we only support macos sequoia and later.
    # Apple relaxes abi requirement for stack arguments to always be at least 8 byte aligned on arm64
    # https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms
    # This hack is required because clang/llvm bug doesn't allow us to just use {host's triple}+'-elf' (relocation failures)
    # The bug was fixed in https://github.com/llvm/llvm-project/commit/454cc36630296262cdb6360b60f90a64a97f7f1a but was only backported to xcode 16+
    if OSX and platform.machine() == "arm64":
      head, tail = args[:8], args[8:]
      # Python ints beyond the first eight arguments are wrapped in c_int64; other values pass through unchanged.
      args = head + [ctypes.c_int64(a) if isinstance(a, int) else a for a in tail]
    return cpu_time_execution(lambda: self.fxn(*args), enable=wait)
|
||||
def disassemble(self, lib:bytes):
  """Delegate to capstone_flatdump for `lib` and return its result."""
  return capstone_flatdump(lib)
|
||||
|
||||
class ClangDevice(Compiled):
|
||||
def __init__(self, device:str): super().__init__(device, MallocAllocator, ClangRenderer(), ClangJITCompiler(), CPUProgram)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import tinygrad.runtime.autogen.libc as libc
|
||||
import struct, tinygrad.runtime.autogen.libc as libc
|
||||
from dataclasses import dataclass
|
||||
from tinygrad.helpers import getbits, i2u
|
||||
|
||||
@@ -51,3 +51,10 @@ def relocate(instr: int, ploc: int, tgt: int, r_type: int):
|
||||
case libc.R_AARCH64_LDST64_ABS_LO12_NC: return instr | (getbits(tgt, 3, 11) << 10)
|
||||
case libc.R_AARCH64_LDST128_ABS_LO12_NC: return instr | (getbits(tgt, 4, 11) << 10)
|
||||
raise NotImplementedError(f"Encountered unknown relocation type {r_type}")
|
||||
|
||||
def jit_loader(obj: bytes) -> bytes:
  """Load the object file `obj` with elf_loader, apply its relocations in place and return the image as bytes."""
  image, _, relocs = elf_loader(obj)
  # This is needed because we have an object file, not a .so that has all internal references (like loads of constants from .rodata) resolved.
  for ploc, tgt, r_type, r_addend in relocs:
    # Each relocation rewrites one little-endian 32-bit word located at offset ploc.
    word = slice(ploc, ploc + 4)
    (instr,) = struct.unpack("<I", image[word])
    patched = relocate(instr, ploc, tgt + r_addend, r_type)
    image[word] = struct.pack("<I", patched)
  return bytes(image)
|
||||
|
||||
Reference in New Issue
Block a user