From 58d58c165980cb67844d5a024d4eca4597f92d4f Mon Sep 17 00:00:00 2001
From: George Hotz <72895+geohot@users.noreply.github.com>
Date: Wed, 20 May 2026 13:25:49 -0700
Subject: [PATCH] remove DEVECTORIZE (#16290)

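* remove DEVECTORIZE

* fully remove DEVECTORIZE

Drop the DEVECTORIZE ContextVar and the uses of it touched here (the
"Linux (devectorize)" CI job, examples, extra/gemm scripts, tests, and the
to_program cache key). The "devectorize" rewrite in full_rewrite_to_sink
now always runs, using the same matcher pipeline that the old default
(DEVECTORIZE=1) selected. The `devectorize` PatternMatcher is renamed to
`devectorize_alu` and no longer appends `devectorize_buf_and_index`; the
caller now composes the two explicitly.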
---
 .github/workflows/test.yml            | 21 ---------------------
 examples/anthropic_challenge.py       |  2 +-
 extra/gemm/amd_uop_matmul.py          |  2 +-
 extra/gemm/mi350x_uop_matmul.py       |  2 +-
 extra/gemm/mi350x_uop_matmul_2.py     |  2 +-
 test/backend/test_ops.py              |  2 +-
 test/null/test_linearizer_rewrite.py  |  4 ++--
 tinygrad/codegen/__init__.py          | 14 ++++++--------
 tinygrad/codegen/late/devectorizer.py |  4 ++--
 tinygrad/helpers.py                   |  2 +-
 10 files changed, 16 insertions(+), 39 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index ff2f800ba0..d3a010d957 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -561,27 +561,6 @@ jobs:
 
 # ****** Feature Tests ******
 
-  testdevectorize:
-    name: Linux (devectorize)
-    runs-on: ubuntu-24.04
-    timeout-minutes: 15
-    steps:
-    - name: Checkout Code
-      uses: actions/checkout@v6
-    - name: Setup Environment
-      uses: ./.github/actions/setup-tinygrad
-      with:
-        key: devectorize-minimal
-        deps: testing_unit
-        pydeps: "pillow"
-        llvm: "true"
-    - name: Test LLVM=1 DEVECTORIZE=0
-      run: DEV=CPU:LLVM DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/backend/test_ops.py
-    - name: Test LLVM=1 DEVECTORIZE=0 for model
-      run: DEV=CPU:LLVM DEVECTORIZE=0 python3 test/models/test_efficientnet.py
-    - name: Test DEV=CPU DEVECTORIZE=0
-      run: DEV=CPU DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/backend/test_ops.py
-
   testdsp:
     name: Linux (DSP)
     runs-on: ubuntu-24.04
diff --git a/examples/anthropic_challenge.py b/examples/anthropic_challenge.py
index 0dafd21992..a9adea334c 100644
--- a/examples/anthropic_challenge.py
+++ b/examples/anthropic_challenge.py
@@ -174,7 +174,7 @@ if __name__ == "__main__":
   # *** render to device ***
 
   from tinygrad.codegen import to_program
-  with Context(PCONTIG=2, DEVECTORIZE=2, SPEC=0):
+  with Context(PCONTIG=2, SPEC=0):
     out = tree_traversal(forest_t, val_t, height, rounds)
     sink = out.schedule_linear().src[-1].src[0]
     prg = to_program(sink, VLIWRenderer())
diff --git a/extra/gemm/amd_uop_matmul.py b/extra/gemm/amd_uop_matmul.py
index 9deb336eac..e83233b04a 100644
--- a/extra/gemm/amd_uop_matmul.py
+++ b/extra/gemm/amd_uop_matmul.py
@@ -122,7 +122,7 @@ def eval_custom_matmul(fxn, dt=dtypes.float):
   with Context(DEBUG=0): Tensor.realize(a, b)
 
   ets = []
-  with Context(DEBUG=max(2, DEBUG.value), DEVECTORIZE=2 if dt == dtypes.half else 0):
+  with Context(DEBUG=max(2, DEBUG.value)):
     for _ in range(NUM_RUNS):
       GlobalCounters.reset()
       tst = Tensor.custom_kernel(c, a, b, fxn=fxn)[0].realize()
diff --git a/extra/gemm/mi350x_uop_matmul.py b/extra/gemm/mi350x_uop_matmul.py
index 8aba22eb19..b8a82b784a 100644
--- a/extra/gemm/mi350x_uop_matmul.py
+++ b/extra/gemm/mi350x_uop_matmul.py
@@ -218,7 +218,7 @@ if __name__ == "__main__":
   ref.realize()
 
   GlobalCounters.reset()
-  with Context(DEBUG=max(2, DEBUG.value), DEVECTORIZE=2):
+  with Context(DEBUG=max(2, DEBUG.value)):
     tst = Tensor.custom_kernel(c, a, b, fxn=custom_gemm)[0]
     tst.realize()
   print(f"{(N*M*K*2 / GlobalCounters.time_sum_s)*1e-12:.2f} REAL TFLOPS")
diff --git a/extra/gemm/mi350x_uop_matmul_2.py b/extra/gemm/mi350x_uop_matmul_2.py
index b42c6d40e0..41c51f6e74 100644
--- a/extra/gemm/mi350x_uop_matmul_2.py
+++ b/extra/gemm/mi350x_uop_matmul_2.py
@@ -127,7 +127,7 @@ if __name__ == "__main__":
 
 
   GlobalCounters.reset()
-  with Context(DEBUG=max(2, DEBUG.value), DEVECTORIZE=2):
+  with Context(DEBUG=max(2, DEBUG.value)):
     tst = Tensor.custom_kernel(c, a, b, fxn=custom_gemm)[0]
     tst.realize()
   print(f"{(N*M*K*2 / GlobalCounters.time_sum_s)*1e-12:.2f} REAL TFLOPS")
diff --git a/test/backend/test_ops.py b/test/backend/test_ops.py
index 8fdbf728b3..234cdaf0bb 100644
--- a/test/backend/test_ops.py
+++ b/test/backend/test_ops.py
@@ -2449,7 +2449,7 @@ class TestOps(unittest.TestCase):
 
   @unittest.skipUnless(Device.DEFAULT == "CPU" and DEV.renderer == "LLVM", "DEVECTORIZE=0 only for LLVM")
   def test_strided_conv2d_simple_vec(self):
-    with Context(DEVECTORIZE=0): self.test_strided_conv2d_simple()
+    self.test_strided_conv2d_simple()
 
   @slow_test
   def test_strided_conv2d(self):
diff --git a/test/null/test_linearizer_rewrite.py b/test/null/test_linearizer_rewrite.py
index 4faeac0b48..d0b8e39b5e 100644
--- a/test/null/test_linearizer_rewrite.py
+++ b/test/null/test_linearizer_rewrite.py
@@ -8,7 +8,7 @@ class TestLinearizerRewrite(unittest.TestCase):
   def test_reduction(self):
     t = Tensor.ones((64,64), device="NULL").contiguous().realize()
     out = (t*2).sum(axis=1)
-    with Context(SPLIT_REDUCEOP=0, DEVECTORIZE=0):
+    with Context(SPLIT_REDUCEOP=0):
       si = out.schedule_linear().src[-1]
       opts_to_apply = []
       opts_to_apply.append(Opt(OptOps.UPCAST, 0, 4))
@@ -19,7 +19,7 @@ class TestLinearizerRewrite(unittest.TestCase):
 
   def test_arange(self):
     out = Tensor.arange(32, device="NULL")
-    with Context(SPLIT_REDUCEOP=0, DEVECTORIZE=0):
+    with Context(SPLIT_REDUCEOP=0):
       si = out.schedule_linear().src[-1]
       opts_to_apply = []
       opts_to_apply.append(Opt(OptOps.UPCAST, 0, 4))
diff --git a/tinygrad/codegen/__init__.py b/tinygrad/codegen/__init__.py
index b2205b315e..4dc6e34a06 100644
--- a/tinygrad/codegen/__init__.py
+++ b/tinygrad/codegen/__init__.py
@@ -1,7 +1,7 @@
 from typing import cast
 from dataclasses import replace
 import itertools
-from tinygrad.helpers import DISABLE_FAST_IDIV, DEVECTORIZE, TRANSCENDENTAL, SPEC, DEBUG, VIZ, IMAGE, NOOPT, EMULATED_DTYPES, NOLOCALS, USE_TC
+from tinygrad.helpers import DISABLE_FAST_IDIV, TRANSCENDENTAL, SPEC, DEBUG, VIZ, IMAGE, NOOPT, EMULATED_DTYPES, NOLOCALS, USE_TC
 from tinygrad.helpers import ALLOW_TF32, TracingKey, Context, panic
 from tinygrad.uop.ops import PatternMatcher, graph_rewrite, UOp, pm_lower_index_dtype, Ops, UPat, track_rewrites, KernelInfo, ProgramInfo
 from tinygrad.uop.render import pyrender
@@ -15,7 +15,7 @@ from tinygrad.codegen.gpudims import pm_add_gpudims
 from tinygrad.uop.symbolic import sym, symbolic_simple, gep_pushing, symbolic, pm_move_where_on_load
 from tinygrad.uop.decompositions import get_late_rewrite_patterns, get_transcendental_patterns, pm_dtype_decomps
 from tinygrad.codegen.late.expander import expander, pm_pre_expander, pm_group_for_reduce
-from tinygrad.codegen.late.devectorizer import load_store_folding, load_store_indexing, devectorize, pm_reduce, \
+from tinygrad.codegen.late.devectorizer import load_store_folding, load_store_indexing, devectorize_buf_and_index, devectorize_alu, pm_reduce, \
   ReduceContext, correct_load_store, pm_render, pm_add_loads, pm_make_images
 from tinygrad.codegen.opt.postrange import apply_opts
 from tinygrad.codegen.late.gater import pm_move_gates_from_index
@@ -74,11 +74,9 @@ def full_rewrite_to_sink(ast:UOp, ren:Renderer, optimize:bool=True) -> UOp:
   if IMAGE and ren.target.device in {"QCOM", "CL", "PYTHON", "NULL"}:
     sink = graph_rewrite(sink, pm_make_images, name="create image buffers", bottom_up=True, ctx=ren.target.arch)
 
-  # devectorize (TODO: does this need opts?)
-  if DEVECTORIZE >= 2: pm_devectorize = sym+load_store_folding+load_store_indexing
-  elif DEVECTORIZE: pm_devectorize = sym+devectorize+load_store_folding+correct_load_store+load_store_indexing
-  else: pm_devectorize = sym+load_store_folding+correct_load_store+load_store_indexing
-  if DEVECTORIZE >= 0: sink = graph_rewrite(sink, pm_devectorize, ctx=ren, name="devectorize")
+  # devectorize
+  sink = graph_rewrite(sink, sym+devectorize_alu+devectorize_buf_and_index+load_store_folding+correct_load_store+load_store_indexing,
+                       ctx=ren, name="devectorize")
 
   # lower the index dtype to a concrete int
   sink = graph_rewrite(sink, pm_lower_index_dtype+load_store_indexing+gep_pushing, name="lower all index dtypes")
@@ -204,7 +202,7 @@ def do_to_program(ast:UOp, renderer:Renderer) -> UOp:
 
 to_program_cache: dict[tuple, UOp] = {}
 def to_program(ast:UOp, renderer:Renderer) -> UOp:
-  config = (NOOPT, DEVECTORIZE, EMULATED_DTYPES, NOLOCALS, USE_TC, IMAGE, DISABLE_FAST_IDIV, TRANSCENDENTAL, ALLOW_TF32)
+  config = (NOOPT, EMULATED_DTYPES, NOLOCALS, USE_TC, IMAGE, DISABLE_FAST_IDIV, TRANSCENDENTAL, ALLOW_TF32)
   key = (ast.key, type(renderer), renderer.target, *[x.value for x in config])
   if (prg:=to_program_cache.get(key)) is None: to_program_cache[key] = prg = do_to_program(ast, renderer)
   return prg
diff --git a/tinygrad/codegen/late/devectorizer.py b/tinygrad/codegen/late/devectorizer.py
index 72f5f3a28e..871c22e758 100644
--- a/tinygrad/codegen/late/devectorizer.py
+++ b/tinygrad/codegen/late/devectorizer.py
@@ -261,13 +261,13 @@ devectorize_buf_and_index = PatternMatcher([
    no_vectorized_index),
 ])
 
-devectorize = PatternMatcher([
+devectorize_alu = PatternMatcher([
   # CAST after AFTER
   (UPat(Ops.CAST, name="c").f(Ops.AFTER, allow_any_len=True, name="a"), lambda c,a: c.src[0].after(*a.src[1:]).cast(c.dtype)),
   # no ALU on vectorized dtypes
   (UPat((*GroupOp.ALU, Ops.CAST, Ops.BITCAST), name="alu"), no_vectorized_alu),
   (UPat(Ops.WMMA, name="wmma"), no_vectorized_wmma),
-])+devectorize_buf_and_index
+])
 
 pm_render = PatternMatcher([
   # for rendering, we use explicit VECTORIZE
diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py
index 68b4a90c8d..1fd3b7e498 100644
--- a/tinygrad/helpers.py
+++ b/tinygrad/helpers.py
@@ -239,7 +239,7 @@ USE_TC, TC_SELECT, TC_OPT = ContextVar("TC", 1), ContextVar("TC_SELECT", -1), Co
 TRANSCENDENTAL, NOLOCALS = ContextVar("TRANSCENDENTAL", 1), ContextVar("NOLOCALS", 0)
 SPLIT_REDUCEOP, NO_MEMORY_PLANNER, LRU = ContextVar("SPLIT_REDUCEOP", 1), ContextVar("NO_MEMORY_PLANNER", 0), ContextVar("LRU", 1)
 RING, ALL2ALL, ALLREDUCE_CAST = ContextVar("RING", 1), ContextVar("ALL2ALL", 0), ContextVar("ALLREDUCE_CAST", 1)
-CACHELEVEL, IGNORE_BEAM_CACHE, DEVECTORIZE = ContextVar("CACHELEVEL", 2), ContextVar("IGNORE_BEAM_CACHE", 0), ContextVar("DEVECTORIZE", 1)
+CACHELEVEL, IGNORE_BEAM_CACHE = ContextVar("CACHELEVEL", 2), ContextVar("IGNORE_BEAM_CACHE", 0)
 VALIDATE_WITH_CPU = ContextVar("VALIDATE_WITH_CPU", 0)
 # TODO: this is broken for some indexing
 DISABLE_FAST_IDIV = ContextVar("DISABLE_FAST_IDIV", 1)