Files
tinygrad/tinygrad/runtime/ops_cpu.py
ttomsa aa1e59ab97 X86 with Ops.INS (#14873)
* draft

* cleanup test_encodings

* cleanup test_isel

* model flag state and support rematerialization

* woops

* add vbroadcastss instruction

* don't fuse load if used multiple times in src

* add movabs instruction and fix idiv

* fixes

* add x86 backend to tests

* float16 fix

* rm TwoAddress2nd

* add BARRIER

* test windows ci

* yup isel fixes the mask stuff too and its beautiful

* add cmoves to the spec

* support storing imms

* no TUPLE_ORDER, breaks tests

* fix remaining seg faults

* add float max

* always fuse index

* minor

* fix DEFINE_VAR/SPECIAL and enable multithreading

* linter

* more linter

* more

* more

* more

* let's try this

* perhaps

* start new scheduler

* more scheduling info

* cleaner shuffle functions

* fixup isel tests

* skip bounds check when NOOPs exist

* skip inf rewrite tests

* fix const tag hack and add x86ops to _shape

* fix

* skip a few tests

* func arg order independent from op value

* x86 goes in own linearize

* switch to PARAM

* more

* add min x86op and neg in decomps

* do mulacc in isel

* use def_reg in test_encodings

* enable emulated int64 tests

* how much does this fix

* Ops becomes OpType

* fix

* rm noqa

* rm machine scheduler stuff

* and this

* allow for extending enums and move X86Ops out of uop

* fix imports

* rm X86GroupOp from ops.py

* spacing

* tell mypy to shut up

* more linter

* add x86op test

* allow set[X86Ops] in upat

* move NOOPs to pre_isel_matcher and rm NOOP from spec

* more asserts

* also this

* cleanup encode

* simplify live range

* fix idiv

* add Ops.INS to x86

* more changes

* more changes

* more changes

* fix

* fix

* fix

* fix

* print formatted assembly

* fix 8bit idiv?

* oops

* enable float16  and unaligned vector load/store

* actually no

* move x86 tests

* no more bool cast

* fix

* linter

* linter

* move X86Ops to x86.py

* fix vpbroadcast

* cleanups

* linter

* print correct reg names

* canonical max

* move max/min and add test

* support float16 vector load/store

* rm bad rewrite

* vpsrldq can't access memory

* regalloc takes renderer

* enable vector load/store on all dtypes

* more isel tests

* rm this for now

* a lot better

* fix

* fix

* fix

* deal with flags correctly

* fix

* enable gep noop rule

* fix

* fix

* fix

* add callee saved registers

* use Ops.CONST instead of X86Ops.IMM

* fix

* enable TUPLE_ORDER

* fix

* rm x86 code in linearizer

* fix

* fix

* fix

* move isa rewrites to codegen

* fix

* fix

* skip test_linearizer.py

* skip more tests

* fix

* fix for idiv/mod changes

* fix

* don't use fmadd if it duplicates fused op

* hacky

* fix

* cleanups

* cleanups

* fix

---------

Co-authored-by: George Hotz <72895+geohot@users.noreply.github.com>
2026-05-19 12:42:54 -07:00

143 lines
8.2 KiB
Python

from __future__ import annotations
import platform, sys, ctypes, functools, time, mmap, threading, queue
from tinygrad.helpers import to_mv, OSX, WIN, mv_address, suppress_finalizing, unwrap, data64_le
from tinygrad.device import BufferSpec
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, HCQArgsState, HCQSignal, HCQProgram, MMIOInterface
from tinygrad.runtime.support.hcq import CLikeArgsState
from tinygrad.renderer.cstyle import ClangJITRenderer
from tinygrad.renderer.llvmir import CPULLVMRenderer
from tinygrad.renderer.nir import LVPRenderer
from tinygrad.renderer.isa.x86 import X86Renderer
from tinygrad.runtime.support.elf import jit_loader
from tinygrad.uop.ops import sint
class CPUSignal(HCQSignal):
def _sleep(self, time_spent_since_last_sleep_ms:int):
if self.is_timeline and self.owner is not None:
self.owner.tasks.join()
if self.owner.error_state is not None: raise self.owner.error_state
class CPUWorker(threading.Thread):
def __init__(self, dev, tasks, thread_id):
super().__init__()
self.dev, self.tasks, self.thread_id, self.pool, self.daemon = dev, tasks, thread_id, [], True
def push_task(self, tid, cmd, args):
if len(self.pool) <= tid:
self.pool.append(queue.Queue())
CPUWorker(self, self.pool[tid], thread_id=tid+1).start()
self.pool[tid].put([cmd, 1, len(args)] + args)
def run(self):
while True:
cmd_iter = iter(self.tasks.get())
try:
for cmd in cmd_iter:
threads, args_cnt = next(cmd_iter), next(cmd_iter)
args = [next(cmd_iter) for _ in range(args_cnt)]
for th in range(threads - 1): self.push_task(th, cmd, args)
cmd(self.thread_id, *args)
for th in range(threads - 1): self.pool[th].join()
except Exception as e: self.dev.error_state = e
finally: self.tasks.task_done()
class CPUComputeQueue(HWQueue):
def _exec(self, tid, prg, bufs, *args):
vals = list(args[bufs:])
if 'core_id' in prg.runtimevars: vals[prg.runtimevars['core_id']] = tid
prg.fxn(*map(ctypes.c_uint64, args[:bufs]), *map(ctypes.c_int64 if platform.machine().lower() == "arm64" else ctypes.c_int32, vals))
def _signal(self, tid, signal_addr, value): to_mv(signal_addr, 4).cast('I')[0] = value
def _wait(self, tid, tmpl_sig, signal_addr, value):
tmpl_sig.base_buf = HCQBuffer(signal_addr, 16, view=MMIOInterface(signal_addr, 16))
tmpl_sig.wait(value)
def _timestamp(self, tid, timestamp_addr): to_mv(timestamp_addr, 8).cast('Q')[0] = time.perf_counter_ns()
def cmd(self, cmd, *args, threads=1):
self.q(cmd, threads, len(args), *args)
return self
def memory_barrier(self): return self
def exec(self, prg:CPUProgram, args_state:HCQArgsState, global_size, local_size):
if isinstance(args_state, LVPArgsState):
self.bind_args_state(args_state)
return self.cmd(self._exec, prg, 1, args_state.buf.va_addr)
return self.cmd(self._exec, prg, len(args_state.bufs), *[x.va_addr for x in args_state.bufs], *args_state.vals, threads=(global_size or (1,))[0])
def wait(self, signal, value=0): return self.cmd(self._wait, type(signal)(signal.base_buf, owner=signal.owner, virt=True), signal.value_addr, value)
def timestamp(self, signal): return self.cmd(self._timestamp, signal.timestamp_addr)
def signal(self, signal, value:sint=0): return self.cmd(self._signal, signal.value_addr, value)
def _submit(self, dev): dev.tasks.put(self._q[:])
class LVPArgsState(CLikeArgsState):
def __init__(self, buf, prg, bufs, vals=()): super().__init__(buf, prg, bufs, vals, [*data64_le(buf.va_addr + 12), (len(bufs) + len(vals)) * 2])
# NOTE: MAP_JIT is added to mmap module in python 3.13
MAP_JIT = 0x0800
class CPUProgram(HCQProgram):
rt_lib = None
try: rt_lib = ctypes.CDLL(ctypes.util.find_library('System' if OSX else 'kernel32') if OSX or WIN else 'libgcc_s.so.1')
except OSError: pass
def __init__(self, dev, name:str, lib:bytes, runtimevars:dict[str, int]|None=None, **kwargs):
self.runtimevars = runtimevars or {}
LVP = isinstance(dev.renderer, LVPRenderer)
if sys.platform == "win32": # mypy doesn't understand when WIN is used here
PAGE_EXECUTE_READWRITE, MEM_COMMIT, MEM_RESERVE = 0x40, 0x1000, 0x2000
ctypes.windll.kernel32.VirtualAlloc.restype = ctypes.c_void_p
self.mem = ctypes.windll.kernel32.VirtualAlloc(ctypes.c_void_p(0), ctypes.c_size_t(len(lib)), MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE)
ctypes.memmove(self.mem, lib, len(lib))
ctypes.windll.kernel32.GetCurrentProcess.restype = ctypes.c_void_p
proc = ctypes.windll.kernel32.GetCurrentProcess()
ctypes.windll.kernel32.FlushInstructionCache(ctypes.c_void_p(proc), ctypes.c_void_p(self.mem), ctypes.c_size_t(len(lib)))
self.fxn = ctypes.CFUNCTYPE(None)(self.mem)
else:
# On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/
# MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np)
self.mem = mmap.mmap(-1, len(lib), mmap.MAP_ANON|mmap.MAP_PRIVATE|(MAP_JIT if OSX else 0), mmap.PROT_READ|mmap.PROT_WRITE|mmap.PROT_EXEC)
if OSX: unwrap(CPUProgram.rt_lib).pthread_jit_write_protect_np(False)
if LVP: lib = jit_loader(lib, base=ctypes.addressof(ctypes.c_void_p.from_buffer(self.mem)), link_libs=['m'])
self.mem.write(lib)
if OSX: unwrap(CPUProgram.rt_lib).pthread_jit_write_protect_np(True)
# __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang.
# libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately
# it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux
# Using ["name"] instead of .name because otherwise name is getting mangled: https://docs.python.org/3.12/reference/expressions.html#index-5
if CPUProgram.rt_lib is not None:
CPUProgram.rt_lib["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib)))
else:
# msync should be a universal POSIX way to do this
from tinygrad.runtime.autogen import libc
libc.msync(ctypes.c_void_p(mv_address(self.mem)), len(lib), libc.MS_SYNC | libc.MS_INVALIDATE)
self.fxn = ctypes.CFUNCTYPE(None)(mv_address(self.mem))
super().__init__(LVPArgsState if LVP else HCQArgsState, dev, name, kernargs_alloc_size=12+256 if LVP else 0)
@suppress_finalizing
def __del__(self):
if sys.platform == 'win32': ctypes.windll.kernel32.VirtualFree(ctypes.c_void_p(self.mem), ctypes.c_size_t(0), 0x8000) #0x8000 - MEM_RELEASE
class CPUAllocator(HCQAllocator):
def __init__(self, dev:CPUDevice): super().__init__(dev, supports_copy_from_disk=False, supports_transfer=False)
def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
if options.external_ptr is not None: addr, buf = options.external_ptr, None
elif WIN: addr = mv_address(buf:=mmap.mmap(-1, size, access=mmap.ACCESS_WRITE))
else: addr = mv_address(buf:=mmap.mmap(-1, size, mmap.MAP_ANON | mmap.MAP_SHARED, mmap.PROT_READ | mmap.PROT_WRITE))
return HCQBuffer(va:=addr, sz:=size, meta=buf, view=MMIOInterface(va, sz, fmt='B'), owner=self.dev)
def _as_buffer(self, src) -> memoryview:
self.dev.synchronize()
return to_mv(src.va_addr, src.size)
def _map(self, buf:HCQBuffer):
if buf.view is None or not isinstance(buf.view, MMIOInterface): raise RuntimeError("Cannot map buffer without view to cpu")
return HCQBuffer(buf.view.addr, buf.size, view=buf.view, owner=buf.owner)
def _unmap(self, mb): pass # CPU _map returns a view wrapper, nothing to release
class CPUDevice(HCQCompiled):
def __init__(self, device:str=""):
self.tasks:queue.Queue = queue.Queue()
CPUWorker(self, self.tasks, thread_id=0).start()
super().__init__(device, CPUAllocator(self), [ClangJITRenderer, CPULLVMRenderer, LVPRenderer, X86Renderer], functools.partial(CPUProgram, self),
CPUSignal, CPUComputeQueue, arch={'amd64':'x86_64', 'aarch64':'arm64'}.get(m:=platform.machine().lower(), m)+",native")