remove DEVECTORIZE (#16290)

* remove DEVECTORIZE
* fully remove DEVECTORIZE
2026-06-08 05:54:59 +08:00 · 2026-05-20 13:25:49 -07:00
parent 825f30bf18
commit 58d58c1659
10 changed files with 16 additions and 39 deletions
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -561,27 +561,6 @@ jobs:

 # ****** Feature Tests ******

-  testdevectorize:
-    name: Linux (devectorize)
-    runs-on: ubuntu-24.04
-    timeout-minutes: 15
-    steps:
-    - name: Checkout Code
-      uses: actions/checkout@v6
-    - name: Setup Environment
-      uses: ./.github/actions/setup-tinygrad
-      with:
-        key: devectorize-minimal
-        deps: testing_unit
-        pydeps: "pillow"
-        llvm: "true"
-    - name: Test LLVM=1 DEVECTORIZE=0
-      run: DEV=CPU:LLVM DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/backend/test_ops.py
-    - name: Test LLVM=1 DEVECTORIZE=0 for model
-      run: DEV=CPU:LLVM DEVECTORIZE=0 python3 test/models/test_efficientnet.py
-    - name: Test DEV=CPU DEVECTORIZE=0
-      run: DEV=CPU DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/backend/test_ops.py
-
  testdsp:
    name: Linux (DSP)
    runs-on: ubuntu-24.04
--- a/examples/anthropic_challenge.py
+++ b/examples/anthropic_challenge.py
@@ -174,7 +174,7 @@ if __name__ == "__main__":
  # *** render to device ***

  from tinygrad.codegen import to_program
-  with Context(PCONTIG=2, DEVECTORIZE=2, SPEC=0):
+  with Context(PCONTIG=2, SPEC=0):
    out = tree_traversal(forest_t, val_t, height, rounds)
    sink = out.schedule_linear().src[-1].src[0]
    prg = to_program(sink, VLIWRenderer())
--- a/extra/gemm/amd_uop_matmul.py
+++ b/extra/gemm/amd_uop_matmul.py
@@ -122,7 +122,7 @@ def eval_custom_matmul(fxn, dt=dtypes.float):
  with Context(DEBUG=0): Tensor.realize(a, b)

  ets = []
-  with Context(DEBUG=max(2, DEBUG.value), DEVECTORIZE=2 if dt == dtypes.half else 0):
+  with Context(DEBUG=max(2, DEBUG.value)):
    for _ in range(NUM_RUNS):
      GlobalCounters.reset()
      tst = Tensor.custom_kernel(c, a, b, fxn=fxn)[0].realize()
--- a/extra/gemm/mi350x_uop_matmul.py
+++ b/extra/gemm/mi350x_uop_matmul.py
@@ -218,7 +218,7 @@ if __name__ == "__main__":
  ref.realize()

  GlobalCounters.reset()
-  with Context(DEBUG=max(2, DEBUG.value), DEVECTORIZE=2):
+  with Context(DEBUG=max(2, DEBUG.value)):
    tst = Tensor.custom_kernel(c, a, b, fxn=custom_gemm)[0]
    tst.realize()
  print(f"{(N*M*K*2 / GlobalCounters.time_sum_s)*1e-12:.2f} REAL TFLOPS")
--- a/extra/gemm/mi350x_uop_matmul_2.py
+++ b/extra/gemm/mi350x_uop_matmul_2.py
@@ -127,7 +127,7 @@ if __name__ == "__main__":


  GlobalCounters.reset()
-  with Context(DEBUG=max(2, DEBUG.value), DEVECTORIZE=2):
+  with Context(DEBUG=max(2, DEBUG.value)):
    tst = Tensor.custom_kernel(c, a, b, fxn=custom_gemm)[0]
    tst.realize()
  print(f"{(N*M*K*2 / GlobalCounters.time_sum_s)*1e-12:.2f} REAL TFLOPS")
--- a/test/backend/test_ops.py
+++ b/test/backend/test_ops.py
@@ -2449,7 +2449,7 @@ class TestOps(unittest.TestCase):

  @unittest.skipUnless(Device.DEFAULT == "CPU" and DEV.renderer == "LLVM", "DEVECTORIZE=0 only for LLVM")
  def test_strided_conv2d_simple_vec(self):
-    with Context(DEVECTORIZE=0): self.test_strided_conv2d_simple()
+    self.test_strided_conv2d_simple()

  @slow_test
  def test_strided_conv2d(self):
--- a/test/null/test_linearizer_rewrite.py
+++ b/test/null/test_linearizer_rewrite.py
@@ -8,7 +8,7 @@ class TestLinearizerRewrite(unittest.TestCase):
  def test_reduction(self):
    t = Tensor.ones((64,64), device="NULL").contiguous().realize()
    out = (t*2).sum(axis=1)
-    with Context(SPLIT_REDUCEOP=0, DEVECTORIZE=0):
+    with Context(SPLIT_REDUCEOP=0):
      si = out.schedule_linear().src[-1]
      opts_to_apply = []
      opts_to_apply.append(Opt(OptOps.UPCAST, 0, 4))
@@ -19,7 +19,7 @@ class TestLinearizerRewrite(unittest.TestCase):

  def test_arange(self):
    out = Tensor.arange(32, device="NULL")
-    with Context(SPLIT_REDUCEOP=0, DEVECTORIZE=0):
+    with Context(SPLIT_REDUCEOP=0):
      si = out.schedule_linear().src[-1]
      opts_to_apply = []
      opts_to_apply.append(Opt(OptOps.UPCAST, 0, 4))
--- a/tinygrad/codegen/init.py
+++ b/tinygrad/codegen/init.py
@@ -1,7 +1,7 @@
 from typing import cast
 from dataclasses import replace
 import itertools
-from tinygrad.helpers import DISABLE_FAST_IDIV, DEVECTORIZE, TRANSCENDENTAL, SPEC, DEBUG, VIZ, IMAGE, NOOPT, EMULATED_DTYPES, NOLOCALS, USE_TC
+from tinygrad.helpers import DISABLE_FAST_IDIV, TRANSCENDENTAL, SPEC, DEBUG, VIZ, IMAGE, NOOPT, EMULATED_DTYPES, NOLOCALS, USE_TC
 from tinygrad.helpers import ALLOW_TF32, TracingKey, Context, panic
 from tinygrad.uop.ops import PatternMatcher, graph_rewrite, UOp, pm_lower_index_dtype, Ops, UPat, track_rewrites, KernelInfo, ProgramInfo
 from tinygrad.uop.render import pyrender
@@ -15,7 +15,7 @@ from tinygrad.codegen.gpudims import pm_add_gpudims
 from tinygrad.uop.symbolic import sym, symbolic_simple, gep_pushing, symbolic, pm_move_where_on_load
 from tinygrad.uop.decompositions import get_late_rewrite_patterns, get_transcendental_patterns, pm_dtype_decomps
 from tinygrad.codegen.late.expander import expander, pm_pre_expander, pm_group_for_reduce
-from tinygrad.codegen.late.devectorizer import load_store_folding, load_store_indexing, devectorize, pm_reduce, \
+from tinygrad.codegen.late.devectorizer import load_store_folding, load_store_indexing, devectorize_buf_and_index, devectorize_alu, pm_reduce, \
  ReduceContext, correct_load_store, pm_render, pm_add_loads, pm_make_images
 from tinygrad.codegen.opt.postrange import apply_opts
 from tinygrad.codegen.late.gater import pm_move_gates_from_index
@@ -74,11 +74,9 @@ def full_rewrite_to_sink(ast:UOp, ren:Renderer, optimize:bool=True) -> UOp:
  if IMAGE and ren.target.device in {"QCOM", "CL", "PYTHON", "NULL"}:
    sink = graph_rewrite(sink, pm_make_images, name="create image buffers", bottom_up=True, ctx=ren.target.arch)

-  # devectorize (TODO: does this need opts?)
-  if DEVECTORIZE >= 2: pm_devectorize = sym+load_store_folding+load_store_indexing
-  elif DEVECTORIZE: pm_devectorize = sym+devectorize+load_store_folding+correct_load_store+load_store_indexing
-  else: pm_devectorize = sym+load_store_folding+correct_load_store+load_store_indexing
-  if DEVECTORIZE >= 0: sink = graph_rewrite(sink, pm_devectorize, ctx=ren, name="devectorize")
+  # devectorize
+  sink = graph_rewrite(sink, sym+devectorize_alu+devectorize_buf_and_index+load_store_folding+correct_load_store+load_store_indexing,
+                       ctx=ren, name="devectorize")

  # lower the index dtype to a concrete int
  sink = graph_rewrite(sink, pm_lower_index_dtype+load_store_indexing+gep_pushing, name="lower all index dtypes")
@@ -204,7 +202,7 @@ def do_to_program(ast:UOp, renderer:Renderer) -> UOp:

 to_program_cache: dict[tuple, UOp] = {}
 def to_program(ast:UOp, renderer:Renderer) -> UOp:
-  config = (NOOPT, DEVECTORIZE, EMULATED_DTYPES, NOLOCALS, USE_TC, IMAGE, DISABLE_FAST_IDIV, TRANSCENDENTAL, ALLOW_TF32)
+  config = (NOOPT, EMULATED_DTYPES, NOLOCALS, USE_TC, IMAGE, DISABLE_FAST_IDIV, TRANSCENDENTAL, ALLOW_TF32)
  key = (ast.key, type(renderer), renderer.target, *[x.value for x in config])
  if (prg:=to_program_cache.get(key)) is None: to_program_cache[key] = prg = do_to_program(ast, renderer)
  return prg
--- a/tinygrad/codegen/late/devectorizer.py
+++ b/tinygrad/codegen/late/devectorizer.py
@@ -261,13 +261,13 @@ devectorize_buf_and_index = PatternMatcher([
   no_vectorized_index),
 ])

-devectorize = PatternMatcher([
+devectorize_alu = PatternMatcher([
  # CAST after AFTER
  (UPat(Ops.CAST, name="c").f(Ops.AFTER, allow_any_len=True, name="a"), lambda c,a: c.src[0].after(*a.src[1:]).cast(c.dtype)),
  # no ALU on vectorized dtypes
  (UPat((*GroupOp.ALU, Ops.CAST, Ops.BITCAST), name="alu"), no_vectorized_alu),
  (UPat(Ops.WMMA, name="wmma"), no_vectorized_wmma),
-])+devectorize_buf_and_index
+])

 pm_render = PatternMatcher([
  # for rendering, we use explicit VECTORIZE
--- a/tinygrad/helpers.py
+++ b/tinygrad/helpers.py
@@ -239,7 +239,7 @@ USE_TC, TC_SELECT, TC_OPT = ContextVar("TC", 1), ContextVar("TC_SELECT", -1), Co
 TRANSCENDENTAL, NOLOCALS = ContextVar("TRANSCENDENTAL", 1), ContextVar("NOLOCALS", 0)
 SPLIT_REDUCEOP, NO_MEMORY_PLANNER, LRU = ContextVar("SPLIT_REDUCEOP", 1), ContextVar("NO_MEMORY_PLANNER", 0), ContextVar("LRU", 1)
 RING, ALL2ALL, ALLREDUCE_CAST = ContextVar("RING", 1), ContextVar("ALL2ALL", 0), ContextVar("ALLREDUCE_CAST", 1)
-CACHELEVEL, IGNORE_BEAM_CACHE, DEVECTORIZE = ContextVar("CACHELEVEL", 2), ContextVar("IGNORE_BEAM_CACHE", 0), ContextVar("DEVECTORIZE", 1)
+CACHELEVEL, IGNORE_BEAM_CACHE = ContextVar("CACHELEVEL", 2), ContextVar("IGNORE_BEAM_CACHE", 0)
 VALIDATE_WITH_CPU = ContextVar("VALIDATE_WITH_CPU", 0)
 # TODO: this is broken for some indexing
 DISABLE_FAST_IDIV = ContextVar("DISABLE_FAST_IDIV", 1)