From 58d58c165980cb67844d5a024d4eca4597f92d4f Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Wed, 20 May 2026 13:25:49 -0700 Subject: [PATCH] remove DEVECTORIZE (#16290) * remove DEVECTORIZE * fully remove DEVECTORIZE --- .github/workflows/test.yml | 21 --------------------- examples/anthropic_challenge.py | 2 +- extra/gemm/amd_uop_matmul.py | 2 +- extra/gemm/mi350x_uop_matmul.py | 2 +- extra/gemm/mi350x_uop_matmul_2.py | 2 +- test/backend/test_ops.py | 2 +- test/null/test_linearizer_rewrite.py | 4 ++-- tinygrad/codegen/__init__.py | 14 ++++++-------- tinygrad/codegen/late/devectorizer.py | 4 ++-- tinygrad/helpers.py | 2 +- 10 files changed, 16 insertions(+), 39 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ff2f800ba0..d3a010d957 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -561,27 +561,6 @@ jobs: # ****** Feature Tests ****** - testdevectorize: - name: Linux (devectorize) - runs-on: ubuntu-24.04 - timeout-minutes: 15 - steps: - - name: Checkout Code - uses: actions/checkout@v6 - - name: Setup Environment - uses: ./.github/actions/setup-tinygrad - with: - key: devectorize-minimal - deps: testing_unit - pydeps: "pillow" - llvm: "true" - - name: Test LLVM=1 DEVECTORIZE=0 - run: DEV=CPU:LLVM DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/backend/test_ops.py - - name: Test LLVM=1 DEVECTORIZE=0 for model - run: DEV=CPU:LLVM DEVECTORIZE=0 python3 test/models/test_efficientnet.py - - name: Test DEV=CPU DEVECTORIZE=0 - run: DEV=CPU DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/backend/test_ops.py - testdsp: name: Linux (DSP) runs-on: ubuntu-24.04 diff --git a/examples/anthropic_challenge.py b/examples/anthropic_challenge.py index 0dafd21992..a9adea334c 100644 --- a/examples/anthropic_challenge.py +++ b/examples/anthropic_challenge.py @@ -174,7 +174,7 @@ if __name__ == "__main__": # *** render to device *** from tinygrad.codegen import to_program - with Context(PCONTIG=2, DEVECTORIZE=2, SPEC=0): + with Context(PCONTIG=2, SPEC=0): out = tree_traversal(forest_t, val_t, height, rounds) sink = out.schedule_linear().src[-1].src[0] prg = to_program(sink, VLIWRenderer()) diff --git a/extra/gemm/amd_uop_matmul.py b/extra/gemm/amd_uop_matmul.py index 9deb336eac..e83233b04a 100644 --- a/extra/gemm/amd_uop_matmul.py +++ b/extra/gemm/amd_uop_matmul.py @@ -122,7 +122,7 @@ def eval_custom_matmul(fxn, dt=dtypes.float): with Context(DEBUG=0): Tensor.realize(a, b) ets = [] - with Context(DEBUG=max(2, DEBUG.value), DEVECTORIZE=2 if dt == dtypes.half else 0): + with Context(DEBUG=max(2, DEBUG.value)): for _ in range(NUM_RUNS): GlobalCounters.reset() tst = Tensor.custom_kernel(c, a, b, fxn=fxn)[0].realize() diff --git a/extra/gemm/mi350x_uop_matmul.py b/extra/gemm/mi350x_uop_matmul.py index 8aba22eb19..b8a82b784a 100644 --- a/extra/gemm/mi350x_uop_matmul.py +++ b/extra/gemm/mi350x_uop_matmul.py @@ -218,7 +218,7 @@ if __name__ == "__main__": ref.realize() GlobalCounters.reset() - with Context(DEBUG=max(2, DEBUG.value), DEVECTORIZE=2): + with Context(DEBUG=max(2, DEBUG.value)): tst = Tensor.custom_kernel(c, a, b, fxn=custom_gemm)[0] tst.realize() print(f"{(N*M*K*2 / GlobalCounters.time_sum_s)*1e-12:.2f} REAL TFLOPS") diff --git a/extra/gemm/mi350x_uop_matmul_2.py b/extra/gemm/mi350x_uop_matmul_2.py index b42c6d40e0..41c51f6e74 100644 --- a/extra/gemm/mi350x_uop_matmul_2.py +++ b/extra/gemm/mi350x_uop_matmul_2.py @@ -127,7 +127,7 @@ if __name__ == "__main__": GlobalCounters.reset() - with Context(DEBUG=max(2, DEBUG.value), DEVECTORIZE=2): + with Context(DEBUG=max(2, DEBUG.value)): tst = Tensor.custom_kernel(c, a, b, fxn=custom_gemm)[0] tst.realize() print(f"{(N*M*K*2 / GlobalCounters.time_sum_s)*1e-12:.2f} REAL TFLOPS") diff --git a/test/backend/test_ops.py b/test/backend/test_ops.py index 8fdbf728b3..234cdaf0bb 100644 --- a/test/backend/test_ops.py +++ b/test/backend/test_ops.py @@ -2449,7 +2449,7 @@ class TestOps(unittest.TestCase): @unittest.skipUnless(Device.DEFAULT == "CPU" and DEV.renderer == "LLVM", "DEVECTORIZE=0 only for LLVM") def test_strided_conv2d_simple_vec(self): - with Context(DEVECTORIZE=0): self.test_strided_conv2d_simple() + self.test_strided_conv2d_simple() @slow_test def test_strided_conv2d(self): diff --git a/test/null/test_linearizer_rewrite.py b/test/null/test_linearizer_rewrite.py index 4faeac0b48..d0b8e39b5e 100644 --- a/test/null/test_linearizer_rewrite.py +++ b/test/null/test_linearizer_rewrite.py @@ -8,7 +8,7 @@ class TestLinearizerRewrite(unittest.TestCase): def test_reduction(self): t = Tensor.ones((64,64), device="NULL").contiguous().realize() out = (t*2).sum(axis=1) - with Context(SPLIT_REDUCEOP=0, DEVECTORIZE=0): + with Context(SPLIT_REDUCEOP=0): si = out.schedule_linear().src[-1] opts_to_apply = [] opts_to_apply.append(Opt(OptOps.UPCAST, 0, 4)) @@ -19,7 +19,7 @@ class TestLinearizerRewrite(unittest.TestCase): def test_arange(self): out = Tensor.arange(32, device="NULL") - with Context(SPLIT_REDUCEOP=0, DEVECTORIZE=0): + with Context(SPLIT_REDUCEOP=0): si = out.schedule_linear().src[-1] opts_to_apply = [] opts_to_apply.append(Opt(OptOps.UPCAST, 0, 4)) diff --git a/tinygrad/codegen/__init__.py b/tinygrad/codegen/__init__.py index b2205b315e..4dc6e34a06 100644 --- a/tinygrad/codegen/__init__.py +++ b/tinygrad/codegen/__init__.py @@ -1,7 +1,7 @@ from typing import cast from dataclasses import replace import itertools -from tinygrad.helpers import DISABLE_FAST_IDIV, DEVECTORIZE, TRANSCENDENTAL, SPEC, DEBUG, VIZ, IMAGE, NOOPT, EMULATED_DTYPES, NOLOCALS, USE_TC +from tinygrad.helpers import DISABLE_FAST_IDIV, TRANSCENDENTAL, SPEC, DEBUG, VIZ, IMAGE, NOOPT, EMULATED_DTYPES, NOLOCALS, USE_TC from tinygrad.helpers import ALLOW_TF32, TracingKey, Context, panic from tinygrad.uop.ops import PatternMatcher, graph_rewrite, UOp, pm_lower_index_dtype, Ops, UPat, track_rewrites, KernelInfo, ProgramInfo from tinygrad.uop.render import pyrender @@ -15,7 +15,7 @@ from tinygrad.codegen.gpudims import pm_add_gpudims from tinygrad.uop.symbolic import sym, symbolic_simple, gep_pushing, symbolic, pm_move_where_on_load from tinygrad.uop.decompositions import get_late_rewrite_patterns, get_transcendental_patterns, pm_dtype_decomps from tinygrad.codegen.late.expander import expander, pm_pre_expander, pm_group_for_reduce -from tinygrad.codegen.late.devectorizer import load_store_folding, load_store_indexing, devectorize, pm_reduce, \ +from tinygrad.codegen.late.devectorizer import load_store_folding, load_store_indexing, devectorize_buf_and_index, devectorize_alu, pm_reduce, \ ReduceContext, correct_load_store, pm_render, pm_add_loads, pm_make_images from tinygrad.codegen.opt.postrange import apply_opts from tinygrad.codegen.late.gater import pm_move_gates_from_index @@ -74,11 +74,9 @@ def full_rewrite_to_sink(ast:UOp, ren:Renderer, optimize:bool=True) -> UOp: if IMAGE and ren.target.device in {"QCOM", "CL", "PYTHON", "NULL"}: sink = graph_rewrite(sink, pm_make_images, name="create image buffers", bottom_up=True, ctx=ren.target.arch) - # devectorize (TODO: does this need opts?) - if DEVECTORIZE >= 2: pm_devectorize = sym+load_store_folding+load_store_indexing - elif DEVECTORIZE: pm_devectorize = sym+devectorize+load_store_folding+correct_load_store+load_store_indexing - else: pm_devectorize = sym+load_store_folding+correct_load_store+load_store_indexing - if DEVECTORIZE >= 0: sink = graph_rewrite(sink, pm_devectorize, ctx=ren, name="devectorize") + # devectorize + sink = graph_rewrite(sink, sym+devectorize_alu+devectorize_buf_and_index+load_store_folding+correct_load_store+load_store_indexing, + ctx=ren, name="devectorize") # lower the index dtype to a concrete int sink = graph_rewrite(sink, pm_lower_index_dtype+load_store_indexing+gep_pushing, name="lower all index dtypes") @@ -204,7 +202,7 @@ def do_to_program(ast:UOp, renderer:Renderer) -> UOp: to_program_cache: dict[tuple, UOp] = {} def to_program(ast:UOp, renderer:Renderer) -> UOp: - config = (NOOPT, DEVECTORIZE, EMULATED_DTYPES, NOLOCALS, USE_TC, IMAGE, DISABLE_FAST_IDIV, TRANSCENDENTAL, ALLOW_TF32) + config = (NOOPT, EMULATED_DTYPES, NOLOCALS, USE_TC, IMAGE, DISABLE_FAST_IDIV, TRANSCENDENTAL, ALLOW_TF32) key = (ast.key, type(renderer), renderer.target, *[x.value for x in config]) if (prg:=to_program_cache.get(key)) is None: to_program_cache[key] = prg = do_to_program(ast, renderer) return prg diff --git a/tinygrad/codegen/late/devectorizer.py b/tinygrad/codegen/late/devectorizer.py index 72f5f3a28e..871c22e758 100644 --- a/tinygrad/codegen/late/devectorizer.py +++ b/tinygrad/codegen/late/devectorizer.py @@ -261,13 +261,13 @@ devectorize_buf_and_index = PatternMatcher([ no_vectorized_index), ]) -devectorize = PatternMatcher([ +devectorize_alu = PatternMatcher([ # CAST after AFTER (UPat(Ops.CAST, name="c").f(Ops.AFTER, allow_any_len=True, name="a"), lambda c,a: c.src[0].after(*a.src[1:]).cast(c.dtype)), # no ALU on vectorized dtypes (UPat((*GroupOp.ALU, Ops.CAST, Ops.BITCAST), name="alu"), no_vectorized_alu), (UPat(Ops.WMMA, name="wmma"), no_vectorized_wmma), -])+devectorize_buf_and_index +]) pm_render = PatternMatcher([ # for rendering, we use explicit VECTORIZE diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py index 68b4a90c8d..1fd3b7e498 100644 --- a/tinygrad/helpers.py +++ b/tinygrad/helpers.py @@ -239,7 +239,7 @@ USE_TC, TC_SELECT, TC_OPT = ContextVar("TC", 1), ContextVar("TC_SELECT", -1), Co TRANSCENDENTAL, NOLOCALS = ContextVar("TRANSCENDENTAL", 1), ContextVar("NOLOCALS", 0) SPLIT_REDUCEOP, NO_MEMORY_PLANNER, LRU = ContextVar("SPLIT_REDUCEOP", 1), ContextVar("NO_MEMORY_PLANNER", 0), ContextVar("LRU", 1) RING, ALL2ALL, ALLREDUCE_CAST = ContextVar("RING", 1), ContextVar("ALL2ALL", 0), ContextVar("ALLREDUCE_CAST", 1) -CACHELEVEL, IGNORE_BEAM_CACHE, DEVECTORIZE = ContextVar("CACHELEVEL", 2), ContextVar("IGNORE_BEAM_CACHE", 0), ContextVar("DEVECTORIZE", 1) +CACHELEVEL, IGNORE_BEAM_CACHE = ContextVar("CACHELEVEL", 2), ContextVar("IGNORE_BEAM_CACHE", 0) VALIDATE_WITH_CPU = ContextVar("VALIDATE_WITH_CPU", 0) # TODO: this is broken for some indexing DISABLE_FAST_IDIV = ContextVar("DISABLE_FAST_IDIV", 1)