remove DEVECTORIZE (#16290)

* remove DEVECTORIZE

* fully remove DEVECTORIZE
This commit is contained in:
George Hotz
2026-05-20 13:25:49 -07:00
committed by GitHub
parent 825f30bf18
commit 58d58c1659
10 changed files with 16 additions and 39 deletions

View File

@@ -561,27 +561,6 @@ jobs:
# ****** Feature Tests ******
testdevectorize:
name: Linux (devectorize)
runs-on: ubuntu-24.04
timeout-minutes: 15
steps:
- name: Checkout Code
uses: actions/checkout@v6
- name: Setup Environment
uses: ./.github/actions/setup-tinygrad
with:
key: devectorize-minimal
deps: testing_unit
pydeps: "pillow"
llvm: "true"
- name: Test LLVM=1 DEVECTORIZE=0
run: DEV=CPU:LLVM DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/backend/test_ops.py
- name: Test LLVM=1 DEVECTORIZE=0 for model
run: DEV=CPU:LLVM DEVECTORIZE=0 python3 test/models/test_efficientnet.py
- name: Test DEV=CPU DEVECTORIZE=0
run: DEV=CPU DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/backend/test_ops.py
testdsp:
name: Linux (DSP)
runs-on: ubuntu-24.04

View File

@@ -174,7 +174,7 @@ if __name__ == "__main__":
# *** render to device ***
from tinygrad.codegen import to_program
with Context(PCONTIG=2, DEVECTORIZE=2, SPEC=0):
with Context(PCONTIG=2, SPEC=0):
out = tree_traversal(forest_t, val_t, height, rounds)
sink = out.schedule_linear().src[-1].src[0]
prg = to_program(sink, VLIWRenderer())

View File

@@ -122,7 +122,7 @@ def eval_custom_matmul(fxn, dt=dtypes.float):
with Context(DEBUG=0): Tensor.realize(a, b)
ets = []
with Context(DEBUG=max(2, DEBUG.value), DEVECTORIZE=2 if dt == dtypes.half else 0):
with Context(DEBUG=max(2, DEBUG.value)):
for _ in range(NUM_RUNS):
GlobalCounters.reset()
tst = Tensor.custom_kernel(c, a, b, fxn=fxn)[0].realize()

View File

@@ -218,7 +218,7 @@ if __name__ == "__main__":
ref.realize()
GlobalCounters.reset()
with Context(DEBUG=max(2, DEBUG.value), DEVECTORIZE=2):
with Context(DEBUG=max(2, DEBUG.value)):
tst = Tensor.custom_kernel(c, a, b, fxn=custom_gemm)[0]
tst.realize()
print(f"{(N*M*K*2 / GlobalCounters.time_sum_s)*1e-12:.2f} REAL TFLOPS")

View File

@@ -127,7 +127,7 @@ if __name__ == "__main__":
GlobalCounters.reset()
with Context(DEBUG=max(2, DEBUG.value), DEVECTORIZE=2):
with Context(DEBUG=max(2, DEBUG.value)):
tst = Tensor.custom_kernel(c, a, b, fxn=custom_gemm)[0]
tst.realize()
print(f"{(N*M*K*2 / GlobalCounters.time_sum_s)*1e-12:.2f} REAL TFLOPS")

View File

@@ -2449,7 +2449,7 @@ class TestOps(unittest.TestCase):
@unittest.skipUnless(Device.DEFAULT == "CPU" and DEV.renderer == "LLVM", "DEVECTORIZE=0 only for LLVM")
def test_strided_conv2d_simple_vec(self):
with Context(DEVECTORIZE=0): self.test_strided_conv2d_simple()
self.test_strided_conv2d_simple()
@slow_test
def test_strided_conv2d(self):

View File

@@ -8,7 +8,7 @@ class TestLinearizerRewrite(unittest.TestCase):
def test_reduction(self):
t = Tensor.ones((64,64), device="NULL").contiguous().realize()
out = (t*2).sum(axis=1)
with Context(SPLIT_REDUCEOP=0, DEVECTORIZE=0):
with Context(SPLIT_REDUCEOP=0):
si = out.schedule_linear().src[-1]
opts_to_apply = []
opts_to_apply.append(Opt(OptOps.UPCAST, 0, 4))
@@ -19,7 +19,7 @@ class TestLinearizerRewrite(unittest.TestCase):
def test_arange(self):
out = Tensor.arange(32, device="NULL")
with Context(SPLIT_REDUCEOP=0, DEVECTORIZE=0):
with Context(SPLIT_REDUCEOP=0):
si = out.schedule_linear().src[-1]
opts_to_apply = []
opts_to_apply.append(Opt(OptOps.UPCAST, 0, 4))

View File

@@ -1,7 +1,7 @@
from typing import cast
from dataclasses import replace
import itertools
from tinygrad.helpers import DISABLE_FAST_IDIV, DEVECTORIZE, TRANSCENDENTAL, SPEC, DEBUG, VIZ, IMAGE, NOOPT, EMULATED_DTYPES, NOLOCALS, USE_TC
from tinygrad.helpers import DISABLE_FAST_IDIV, TRANSCENDENTAL, SPEC, DEBUG, VIZ, IMAGE, NOOPT, EMULATED_DTYPES, NOLOCALS, USE_TC
from tinygrad.helpers import ALLOW_TF32, TracingKey, Context, panic
from tinygrad.uop.ops import PatternMatcher, graph_rewrite, UOp, pm_lower_index_dtype, Ops, UPat, track_rewrites, KernelInfo, ProgramInfo
from tinygrad.uop.render import pyrender
@@ -15,7 +15,7 @@ from tinygrad.codegen.gpudims import pm_add_gpudims
from tinygrad.uop.symbolic import sym, symbolic_simple, gep_pushing, symbolic, pm_move_where_on_load
from tinygrad.uop.decompositions import get_late_rewrite_patterns, get_transcendental_patterns, pm_dtype_decomps
from tinygrad.codegen.late.expander import expander, pm_pre_expander, pm_group_for_reduce
from tinygrad.codegen.late.devectorizer import load_store_folding, load_store_indexing, devectorize, pm_reduce, \
from tinygrad.codegen.late.devectorizer import load_store_folding, load_store_indexing, devectorize_buf_and_index, devectorize_alu, pm_reduce, \
ReduceContext, correct_load_store, pm_render, pm_add_loads, pm_make_images
from tinygrad.codegen.opt.postrange import apply_opts
from tinygrad.codegen.late.gater import pm_move_gates_from_index
@@ -74,11 +74,9 @@ def full_rewrite_to_sink(ast:UOp, ren:Renderer, optimize:bool=True) -> UOp:
if IMAGE and ren.target.device in {"QCOM", "CL", "PYTHON", "NULL"}:
sink = graph_rewrite(sink, pm_make_images, name="create image buffers", bottom_up=True, ctx=ren.target.arch)
# devectorize (TODO: does this need opts?)
if DEVECTORIZE >= 2: pm_devectorize = sym+load_store_folding+load_store_indexing
elif DEVECTORIZE: pm_devectorize = sym+devectorize+load_store_folding+correct_load_store+load_store_indexing
else: pm_devectorize = sym+load_store_folding+correct_load_store+load_store_indexing
if DEVECTORIZE >= 0: sink = graph_rewrite(sink, pm_devectorize, ctx=ren, name="devectorize")
# devectorize
sink = graph_rewrite(sink, sym+devectorize_alu+devectorize_buf_and_index+load_store_folding+correct_load_store+load_store_indexing,
ctx=ren, name="devectorize")
# lower the index dtype to a concrete int
sink = graph_rewrite(sink, pm_lower_index_dtype+load_store_indexing+gep_pushing, name="lower all index dtypes")
@@ -204,7 +202,7 @@ def do_to_program(ast:UOp, renderer:Renderer) -> UOp:
to_program_cache: dict[tuple, UOp] = {}
def to_program(ast:UOp, renderer:Renderer) -> UOp:
config = (NOOPT, DEVECTORIZE, EMULATED_DTYPES, NOLOCALS, USE_TC, IMAGE, DISABLE_FAST_IDIV, TRANSCENDENTAL, ALLOW_TF32)
config = (NOOPT, EMULATED_DTYPES, NOLOCALS, USE_TC, IMAGE, DISABLE_FAST_IDIV, TRANSCENDENTAL, ALLOW_TF32)
key = (ast.key, type(renderer), renderer.target, *[x.value for x in config])
if (prg:=to_program_cache.get(key)) is None: to_program_cache[key] = prg = do_to_program(ast, renderer)
return prg

View File

@@ -261,13 +261,13 @@ devectorize_buf_and_index = PatternMatcher([
no_vectorized_index),
])
devectorize = PatternMatcher([
devectorize_alu = PatternMatcher([
# CAST after AFTER
(UPat(Ops.CAST, name="c").f(Ops.AFTER, allow_any_len=True, name="a"), lambda c,a: c.src[0].after(*a.src[1:]).cast(c.dtype)),
# no ALU on vectorized dtypes
(UPat((*GroupOp.ALU, Ops.CAST, Ops.BITCAST), name="alu"), no_vectorized_alu),
(UPat(Ops.WMMA, name="wmma"), no_vectorized_wmma),
])+devectorize_buf_and_index
])
pm_render = PatternMatcher([
# for rendering, we use explicit VECTORIZE

View File

@@ -239,7 +239,7 @@ USE_TC, TC_SELECT, TC_OPT = ContextVar("TC", 1), ContextVar("TC_SELECT", -1), Co
TRANSCENDENTAL, NOLOCALS = ContextVar("TRANSCENDENTAL", 1), ContextVar("NOLOCALS", 0)
SPLIT_REDUCEOP, NO_MEMORY_PLANNER, LRU = ContextVar("SPLIT_REDUCEOP", 1), ContextVar("NO_MEMORY_PLANNER", 0), ContextVar("LRU", 1)
RING, ALL2ALL, ALLREDUCE_CAST = ContextVar("RING", 1), ContextVar("ALL2ALL", 0), ContextVar("ALLREDUCE_CAST", 1)
CACHELEVEL, IGNORE_BEAM_CACHE, DEVECTORIZE = ContextVar("CACHELEVEL", 2), ContextVar("IGNORE_BEAM_CACHE", 0), ContextVar("DEVECTORIZE", 1)
CACHELEVEL, IGNORE_BEAM_CACHE = ContextVar("CACHELEVEL", 2), ContextVar("IGNORE_BEAM_CACHE", 0)
VALIDATE_WITH_CPU = ContextVar("VALIDATE_WITH_CPU", 0)
# TODO: this is broken for some indexing
DISABLE_FAST_IDIV = ContextVar("DISABLE_FAST_IDIV", 1)