diff --git a/docs/env_vars.md b/docs/env_vars.md index 58efa542fb..92cd5c7990 100644 --- a/docs/env_vars.md +++ b/docs/env_vars.md @@ -57,6 +57,8 @@ AMD:LLVM | use the AMD device with the LLVM renderer NV:CUDA:sm_70 | use the NV device with the CUDA renderer targetting sm_70 AMD::gfx950 | use the AMD device targetting gfx950 USB+AMD | use the AMD device over the USB interface +CPU:LLVM | use the CPU device with the LLVM renderer +CPU:LLVM:x86_64,znver2,avx2,-avx512f | use the CPU device with the LLVM renderer, with [additional arch flags](runtime.md#cpu-arch) ### Debug breakdown diff --git a/docs/runtime.md b/docs/runtime.md index 257ad0ff0c..ac2d0e7d60 100644 --- a/docs/runtime.md +++ b/docs/runtime.md @@ -10,7 +10,7 @@ tinygrad supports various runtimes, enabling your code to scale across a wide ra | [METAL](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_metal.py) | Utilizes Metal for acceleration on Apple devices | - | M1+ Macs; Metal 3.0+ for `bfloat` support | | [CUDA](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cuda.py) | Utilizes CUDA for acceleration on NVIDIA GPUs | nvrtc (default)
PTX (`DEV=CUDA:PTX`) | NVIDIA GPU with CUDA support | | [CL](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cl.py) | Accelerates computations using OpenCL on GPUs | - | OpenCL 2.0 compatible device | -| [CPU](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cpu.py) | Runs on CPU using the clang or llvm compiler | Clang JIT (default)
LLVM IR (`DEV=CPU:LLVM`) | `clang` compiler in system `PATH` | +| [CPU](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cpu.py) | Runs on CPU using the clang or llvm compiler | Clang JIT (default)
LLVM IR (`DEV=CPU:LLVM`) | `clang` compiler in system `PATH`
You can specify additional arch parameters via [the `DEV` variable](env_vars.md#dev-variable). See [CPU arch](#cpu-arch) for details. | | [WEBGPU](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_webgpu.py) | Runs on GPU using the Dawn WebGPU engine (used in Google Chrome) | - | Dawn library installed and discoverable. Binaries: [pydawn v0.3.0](https://github.com/wpmed92/pydawn/releases/tag/v0.3.0) | @@ -79,3 +79,13 @@ NV backend supports several interfaces for communicating with devices: * `NVK`: uses the nvidia driver * `PCI`: uses the [NV driver](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/support/nv/nvdev.py) + +## CPU Arch +The CPU renderers may be additionally configured using the arch component of [the `DEV` environment variable](env_vars.md#dev-variable). +CPU arch should be specified as a comma-separated list of parameters, and must contain at least two values: the architecture family (ie. x86_64, arm64, or riscv64) and the cpu type (as accepted by `clang`'s `-march`). +If native is specified as the cpu type, tinygrad (or delegate compiler) will query the host cpu type. Additional comma-separated values may be specified as follows: + +* `AMX`: emit Apple silicon AMX instructions + +All other additional values are interpreted as cpu feature flags. When a value is preceded by a `-` character, the corresponding feature flag will be disabled, otherwise the flag will be enabled. +Note that enabled feature flags should not be preceded by a `+`. diff --git a/test/null/test_device.py b/test/null/test_device.py index 44716240d7..90f8588fef 100644 --- a/test/null/test_device.py +++ b/test/null/test_device.py @@ -132,6 +132,7 @@ class TestDevVar(unittest.TestCase): for d, t in [("AMD", Target(device="AMD", renderer="")), ("AMD:LLVM", Target(device="AMD", renderer="LLVM")), (":LLVM", Target(device="", renderer="LLVM")), ("AMD::gfx1100", Target(device="AMD", arch="gfx1100")), ("AMD:LLVM:gfx1100", Target(device="AMD", renderer="LLVM", arch="gfx1100")), ("::gfx1100", Target(arch="gfx1100")), + ("CPU:LLVM:arm64,native,AMX", Target(device="CPU", renderer="LLVM", arch="arm64,native,AMX")), ("USB+", Target(interface="USB")), ("USB+AMD", Target(device="AMD", interface="USB")), ("PCI:0+AMD", Target(device="AMD", interface="PCI", indices="0")), (":0+AMD", Target(device="AMD", indices="0")), ("PCI:0,1+AMD", Target(device="AMD", interface="PCI", indices="0,1")), diff --git a/test/null/test_elf.py b/test/null/test_elf.py index c8ab282085..f7d350bd34 100644 --- a/test/null/test_elf.py +++ b/test/null/test_elf.py @@ -23,7 +23,7 @@ class TestElfLoader(unittest.TestCase): } ''' with self.assertRaisesRegex(RuntimeError, 'evil_external_function'): - ClangJITCompiler({'AMD64':'x86_64', 'aarch64':'arm64'}.get(m:=platform.machine(), m)+",native").compile(src) + ClangJITCompiler([{'AMD64':'x86_64', 'aarch64':'arm64'}.get(m:=platform.machine(), m), "native"]).compile(src) def test_link(self): src = ''' float powf(float, float); // from libm diff --git a/test/opt/test_gen_float4.py b/test/opt/test_gen_float4.py index 8e8111c7b0..61f66f2ecf 100644 --- a/test/opt/test_gen_float4.py +++ b/test/opt/test_gen_float4.py @@ -3,9 +3,11 @@ from tinygrad import Device, Tensor, dtypes from tinygrad.uop.ops import UOp, Ops from tinygrad.codegen.opt import Opt, OptOps from tinygrad.engine.realize import get_program -from tinygrad.helpers import AMX +from tinygrad.helpers import DEV from test.helpers import replace_opts +AMX = "AMX" in DEV.arch + @unittest.skipUnless(Device[Device.DEFAULT].renderer.supports_float4, "need backends that support float4") class TestFloat4(unittest.TestCase): @staticmethod diff --git a/test/opt/test_tensor_cores.py b/test/opt/test_tensor_cores.py index 736c26d110..2d04574441 100644 --- a/test/opt/test_tensor_cores.py +++ b/test/opt/test_tensor_cores.py @@ -7,7 +7,7 @@ from tinygrad.tensor import _to_np_dtype from tinygrad.uop.ops import Ops from tinygrad.dtype import DType from tinygrad.device import is_dtype_supported -from tinygrad.helpers import AMX, DEV, Context +from tinygrad.helpers import DEV, Context from test.helpers import slow, replace_opts from tinygrad.engine.realize import CompiledRunner, get_program from tinygrad.codegen.opt import Opt, OptOps, KernelOptError @@ -18,6 +18,8 @@ from test.backend.test_linearizer import helper_realized_ast, helper_linearizer_ # NOTE: get_program always passes in Device[Device.DEFAULT].renderer explicitly for process_replay!!! +AMX = "AMX" in DEV.arch + def helper_tc_ensure_uops_and_opts_count(N: int, M:int, K:int, dtype_in:DType, dtype_out:DType, axis:int=0, tc_select:int=-1, tc_opt:int=0, ensure_triggered:bool=True): a, b = Tensor.rand(M, K, dtype=dtype_in), Tensor.rand(K, N, dtype=dtype_in) diff --git a/tinygrad/codegen/late/devectorizer.py b/tinygrad/codegen/late/devectorizer.py index a4def64c0a..992dd9bd3f 100644 --- a/tinygrad/codegen/late/devectorizer.py +++ b/tinygrad/codegen/late/devectorizer.py @@ -5,7 +5,7 @@ from dataclasses import dataclass from tinygrad.dtype import dtypes, ImageDType, DType, AddrSpace, Invalid, PtrDType from tinygrad.uop.ops import UOp, Ops, UPat, PatternMatcher, GroupOp, identity_element from tinygrad.uop.symbolic import uop_given_valid, parse_valid, invalid_gate -from tinygrad.helpers import getenv, flatten, AMX, prod +from tinygrad.helpers import getenv, flatten, prod from tinygrad.renderer import Renderer # ***** image load valid simplification ***** @@ -171,7 +171,7 @@ def split_load_store(ctx:Renderer|None, ls:UOp, idx:UOp): lengths = [4] elif ctx is not None and ctx.supports_float4: # TODO: a better way to get this than ctx - lengths = [8,4,2] if buf.dtype.base == dtypes.half and getenv("ALLOW_HALF8") else ([16,8,4,2] if AMX else [4,2]) + lengths = [8,4,2] if buf.dtype.base == dtypes.half and getenv("ALLOW_HALF8") else ([16,8,4,2] if "AMX" in ctx.target.arch else [4,2]) lengths.append(1) # worst case, it's not folded # filter fold lengths that don't divide diff --git a/tinygrad/codegen/opt/heuristic.py b/tinygrad/codegen/opt/heuristic.py index 7a5b5cee89..1049b3bbf2 100644 --- a/tinygrad/codegen/opt/heuristic.py +++ b/tinygrad/codegen/opt/heuristic.py @@ -1,6 +1,6 @@ import itertools from tinygrad.codegen.opt import Opt, OptOps, KernelOptError -from tinygrad.helpers import getenv, DEBUG, prod, NOLOCALS, TC_OPT, TC_SELECT, USE_TC, AMX, IMAGE +from tinygrad.helpers import getenv, DEBUG, prod, NOLOCALS, TC_OPT, TC_SELECT, USE_TC, IMAGE from tinygrad.dtype import PtrDType, ImageDType from tinygrad.uop.ops import Ops, resolve, AxisType from tinygrad.codegen.opt.postrange import Scheduler @@ -34,7 +34,7 @@ def hand_coded_optimizations(k:Scheduler) -> Scheduler: except KernelOptError: pass # skip hand-coded TC opts if AMX, upcasting will make kernel slower - if good_tc_opt and not AMX: + if good_tc_opt and "AMX" not in k.ren.target.arch: if rngs is not None: for tc_dim in [1,0]: # attempt to upcast M and N szs = [sz for sz in [5,4,3,2] if rngs[tc_dim].src[0].divides(sz) is not None] diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py index 326d42b207..2cc58cc68d 100644 --- a/tinygrad/helpers.py +++ b/tinygrad/helpers.py @@ -232,7 +232,7 @@ DEV, DEBUG, BEAM, NOOPT = _DEV("DEV", ""), ContextVar("DEBUG", 0), ContextVar("B IMAGE, FLOAT16, OPENPILOT_HACKS = ContextVar("IMAGE", 0), ContextVar("FLOAT16", 0), ContextVar("OPENPILOT_HACKS", 0) JIT, JIT_BATCH_SIZE = ContextVar("JIT", 2 if OSX and ARCH_X86 else 1), ContextVar("JIT_BATCH_SIZE", 32) WINO, CAPTURING, TRACEMETA, NO_COLOR = ContextVar("WINO", 0), ContextVar("CAPTURING", 1), ContextVar("TRACEMETA", 1), ContextVar("NO_COLOR", 0) -USE_TC, TC_SELECT, TC_OPT, AMX = ContextVar("TC", 1), ContextVar("TC_SELECT", -1), ContextVar("TC_OPT", 0), ContextVar("AMX", 0) +USE_TC, TC_SELECT, TC_OPT = ContextVar("TC", 1), ContextVar("TC_SELECT", -1), ContextVar("TC_OPT", 0) TRANSCENDENTAL, NOLOCALS = ContextVar("TRANSCENDENTAL", 1), ContextVar("NOLOCALS", 0) SPLIT_REDUCEOP, NO_MEMORY_PLANNER, LRU = ContextVar("SPLIT_REDUCEOP", 1), ContextVar("NO_MEMORY_PLANNER", 0), ContextVar("LRU", 1) RING, ALL2ALL, ALLREDUCE_CAST = ContextVar("RING", 1), ContextVar("ALL2ALL", 0), ContextVar("ALLREDUCE_CAST", 1) diff --git a/tinygrad/renderer/cstyle.py b/tinygrad/renderer/cstyle.py index 98b3118d50..cc4ba86222 100644 --- a/tinygrad/renderer/cstyle.py +++ b/tinygrad/renderer/cstyle.py @@ -3,7 +3,7 @@ import math, sys, struct from collections import defaultdict, Counter from tinygrad.codegen.opt import tc from tinygrad.uop.ops import GroupOp, Ops, UOp, PatternMatcher, UPat, range_str, axis_letters -from tinygrad.helpers import strip_parens, getenv, prod, dedup, Target, AMX, CPU_COUNT +from tinygrad.helpers import strip_parens, getenv, prod, dedup, Target, CPU_COUNT from tinygrad.dtype import ImageDType, dtypes, DType, PtrDType, AddrSpace, truncate, float_to_bf16 from tinygrad.renderer import Renderer from tinygrad.codegen.late.devectorizer import no_vectorized_alu @@ -226,7 +226,6 @@ class ClangRenderer(CStyleLanguage): global_max = (CPU_COUNT.value, 0, 0) infinity = "__builtin_inff()" nan = '__builtin_nanf("")' - if AMX: tensor_cores = tc.amx # language options buffer_suffix = " restrict" @@ -280,7 +279,8 @@ class ClangJITRenderer(ClangRenderer): def __init__(self, target:Target): super().__init__(target) from tinygrad.runtime.support.compiler_cpu import ClangJITCompiler - self.compiler = ClangJITCompiler(target.arch) + if "AMX" in target.arch: self.tensor_cores = tc.amx + self.compiler = ClangJITCompiler([x for x in target.arch.split(",") if x != "AMX"]) class OpenCLRenderer(CStyleLanguage): has_aux = True diff --git a/tinygrad/renderer/llvmir.py b/tinygrad/renderer/llvmir.py index d21ef31319..e7b2d1b358 100644 --- a/tinygrad/renderer/llvmir.py +++ b/tinygrad/renderer/llvmir.py @@ -6,7 +6,7 @@ from tinygrad.renderer.cstyle import HIPRenderer, create_non_native_float_pats, from tinygrad.uop.decompositions import xexp2, xlog2 from tinygrad.uop.ops import UOp, PatternMatcher, UPat, Ops, GroupOp, range_str from tinygrad.dtype import dtypes, float_to_fp8, DType, PtrDType, truncate -from tinygrad.helpers import prod, Target, AMX, CPU_COUNT, getenv +from tinygrad.helpers import prod, Target, CPU_COUNT, getenv def ldt(dt:DType): if dt.vcount > 1: return f"<{dt.vcount} x {ldt(dt.scalar())}>" @@ -134,7 +134,6 @@ class LLVMRenderer(Renderer): abi: str | None string_rewrite: PatternMatcher code_for_op = {k:lambda:None for v in lop.values() for k in v.keys()} - if AMX: tensor_cores = tc.amx extra_matcher = create_non_native_float_pats((dtypes.bfloat16,)) + pm_manual_bf16_cast def _render_fn(self, name:str, args:list[tuple[str,DType]], kernel:list[str], prefix:list[str]|None=None) -> str: @@ -149,7 +148,7 @@ class LLVMRenderer(Renderer): local_args: list[str] = [] for u in uops: - if AMX and u.op is Ops.WMMA: # prealloc aux buffers as AMX can only load from memory + if self.tensor_cores == tc.amx and u.op is Ops.WMMA: # prealloc aux buffers as AMX can only load from memory vc += 1 r[u] = f"%wmma{vc}" for i, dtype in enumerate(u.arg[2].vec(sz) for sz in [prod(size for _, size in upcast) for upcast in u.arg[6]]): @@ -204,7 +203,8 @@ class CPULLVMRenderer(LLVMRenderer): def __init__(self, target:Target): super().__init__(target) from tinygrad.runtime.support.compiler_cpu import CPULLVMCompiler - self.compiler = CPULLVMCompiler(target.arch) + if "AMX" in target.arch: self.tensor_cores = tc.amx + self.compiler = CPULLVMCompiler([x for x in target.arch.split(",") if x != "AMX"]) barrier = 'fence syncscope("workgroup") release\ntail call void @llvm.amdgcn.s.barrier()\nfence syncscope("workgroup") acquire\n' code_for_workitem = {"g": lambda x: f"tail call i32 @llvm.amdgcn.workgroup.id.{chr(120+int(x))}()", diff --git a/tinygrad/runtime/support/compiler_cpu.py b/tinygrad/runtime/support/compiler_cpu.py index d468ef668e..2777d25b48 100644 --- a/tinygrad/runtime/support/compiler_cpu.py +++ b/tinygrad/runtime/support/compiler_cpu.py @@ -5,17 +5,17 @@ from tinygrad.runtime.support.elf import jit_loader from tinygrad.runtime.autogen import llvm class ClangJITCompiler(Compiler): - def __init__(self, arch, cachekey="compile_clang_jit"): - self.arch, cpu, feats = (sp:=arch.split(',', 2)) + [""] * (3 - len(sp)) + def __init__(self, arch:list[str], cachekey="compile_clang_jit"): + self.arch, cpu, *feats = arch assert self.arch and cpu, f"invalid arch string: {arch!r}, expected ',,[]' (eg. 'x86_64,znver2')" match self.arch: - case "x86_64": self.args = [f"-march={cpu}"] + [f"-mno{f}" if f.startswith("-") else f"-m{f}" for f in feats.split(',') if f] + case "x86_64": self.args = [f"-march={cpu}"] + [f"-mno{f}" if f.startswith("-") else f"-m{f}" for f in feats] # on arm march means "runs on this arch and superset" instead of "optimize for this arch". x86 march == arm mcpu # x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm - case "arm64": self.args = ["-ffixed-x18", "-mcpu=" + "+".join([cpu] + ["no"+f[1:] if f.startswith("-") else f for f in feats.split(',') if f])] - case "riscv64": self.args = ["-march=" + "_".join(["rv64g" if cpu == "native" else cpu] + [f for f in feats.split(',') if f])] + case "arm64": self.args = ["-ffixed-x18", "-mcpu=" + "+".join([cpu] + ["no"+f[1:] if f.startswith("-") else f for f in feats])] + case "riscv64": self.args = ["-march=" + "_".join(["rv64g" if cpu == "native" else cpu] + feats)] case _: raise RuntimeError(f"unsupported arch: {self.arch!r}") - super().__init__(f"{cachekey}_{arch}") + super().__init__(f"{cachekey}_{'_'.join(arch)}") def compile_to_obj(self, src:str) -> bytes: """Compile C source to ELF object file (before linking).""" @@ -91,14 +91,14 @@ class LLVMCompiler(Compiler): class CPULLVMCompiler(LLVMCompiler): - def __init__(self, arch, cache_key=None): - self.arch, cpu, feats = (sp:=arch.split(',', 2)) + [""] * (3 - len(sp)) + def __init__(self, arch:list[str], cache_key=None): + self.arch, cpu, *feats = arch assert self.arch and cpu, f"invalid arch string: {arch!r}, expected ',,[]' (eg. 'x86_64,znver2')" - feats = ','.join(f if f.startswith('-') else '+'+f for f in feats.split(',') if f) + featstr = ','.join(f if f.startswith('-') else '+'+f for f in feats) if cpu == "native": cpu = ctypes.string_at(llvm.LLVMGetHostCPUName()).decode() - feats = (feats + "," if feats else "") + ctypes.string_at(llvm.LLVMGetHostCPUFeatures()).decode() + featstr = (featstr + "," if featstr else "") + ctypes.string_at(llvm.LLVMGetHostCPUFeatures()).decode() # +reserve-x18 here does the same thing as -ffixed-x18 in ClangJITCompiler, see comments there for why it's needed on arm osx - super().__init__(self.arch, cpu, ('+reserve-x18,' if self.arch == "arm64" else '') + feats, cache_key) + super().__init__(self.arch, cpu, ('+reserve-x18,' if self.arch == "arm64" else '') + featstr, cache_key) def disassemble(self, lib:bytes): capstone_flatdump(lib, self.arch) diff --git a/tinygrad/runtime/support/compiler_mesa.py b/tinygrad/runtime/support/compiler_mesa.py index 2479bf0b1f..922605a9f1 100644 --- a/tinygrad/runtime/support/compiler_mesa.py +++ b/tinygrad/runtime/support/compiler_mesa.py @@ -17,7 +17,7 @@ def deserialize(enc_src, opts): return mesa.nir_deserialize(None, ctypes.cast(opts, ctypes.POINTER(mesa.nir_shader_compiler_options)), blobreader) class LVPCompiler(CPULLVMCompiler): - def __init__(self, arch): CPULLVMCompiler.__init__(self, arch, cache_key="compile_lvp") + def __init__(self, arch): CPULLVMCompiler.__init__(self, arch.split(","), cache_key="compile_lvp") def compile(self, src) -> bytes: shader, ctx = deserialize(src, mesa.lvp_nir_options), llvm.LLVMGetGlobalContext()