mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-08 05:54:59 +08:00
remove DEVECTORIZE (#16290)
* remove DEVECTORIZE
* fully remove DEVECTORIZE
This commit is contained in:
21
.github/workflows/test.yml
vendored
21
.github/workflows/test.yml
vendored
@@ -561,27 +561,6 @@ jobs:
|
||||
|
||||
# ****** Feature Tests ******
|
||||
|
||||
testdevectorize:
|
||||
name: Linux (devectorize)
|
||||
runs-on: ubuntu-24.04
|
||||
timeout-minutes: 15
|
||||
steps:
|
||||
- name: Checkout Code
|
||||
uses: actions/checkout@v6
|
||||
- name: Setup Environment
|
||||
uses: ./.github/actions/setup-tinygrad
|
||||
with:
|
||||
key: devectorize-minimal
|
||||
deps: testing_unit
|
||||
pydeps: "pillow"
|
||||
llvm: "true"
|
||||
- name: Test LLVM=1 DEVECTORIZE=0
|
||||
run: DEV=CPU:LLVM DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/backend/test_ops.py
|
||||
- name: Test LLVM=1 DEVECTORIZE=0 for model
|
||||
run: DEV=CPU:LLVM DEVECTORIZE=0 python3 test/models/test_efficientnet.py
|
||||
- name: Test DEV=CPU DEVECTORIZE=0
|
||||
run: DEV=CPU DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/backend/test_ops.py
|
||||
|
||||
testdsp:
|
||||
name: Linux (DSP)
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
@@ -174,7 +174,7 @@ if __name__ == "__main__":
|
||||
# *** render to device ***
|
||||
|
||||
from tinygrad.codegen import to_program
|
||||
with Context(PCONTIG=2, DEVECTORIZE=2, SPEC=0):
|
||||
with Context(PCONTIG=2, SPEC=0):
|
||||
out = tree_traversal(forest_t, val_t, height, rounds)
|
||||
sink = out.schedule_linear().src[-1].src[0]
|
||||
prg = to_program(sink, VLIWRenderer())
|
||||
|
||||
@@ -122,7 +122,7 @@ def eval_custom_matmul(fxn, dt=dtypes.float):
|
||||
with Context(DEBUG=0): Tensor.realize(a, b)
|
||||
|
||||
ets = []
|
||||
with Context(DEBUG=max(2, DEBUG.value), DEVECTORIZE=2 if dt == dtypes.half else 0):
|
||||
with Context(DEBUG=max(2, DEBUG.value)):
|
||||
for _ in range(NUM_RUNS):
|
||||
GlobalCounters.reset()
|
||||
tst = Tensor.custom_kernel(c, a, b, fxn=fxn)[0].realize()
|
||||
|
||||
@@ -218,7 +218,7 @@ if __name__ == "__main__":
|
||||
ref.realize()
|
||||
|
||||
GlobalCounters.reset()
|
||||
with Context(DEBUG=max(2, DEBUG.value), DEVECTORIZE=2):
|
||||
with Context(DEBUG=max(2, DEBUG.value)):
|
||||
tst = Tensor.custom_kernel(c, a, b, fxn=custom_gemm)[0]
|
||||
tst.realize()
|
||||
print(f"{(N*M*K*2 / GlobalCounters.time_sum_s)*1e-12:.2f} REAL TFLOPS")
|
||||
|
||||
@@ -127,7 +127,7 @@ if __name__ == "__main__":
|
||||
|
||||
|
||||
GlobalCounters.reset()
|
||||
with Context(DEBUG=max(2, DEBUG.value), DEVECTORIZE=2):
|
||||
with Context(DEBUG=max(2, DEBUG.value)):
|
||||
tst = Tensor.custom_kernel(c, a, b, fxn=custom_gemm)[0]
|
||||
tst.realize()
|
||||
print(f"{(N*M*K*2 / GlobalCounters.time_sum_s)*1e-12:.2f} REAL TFLOPS")
|
||||
|
||||
@@ -2449,7 +2449,7 @@ class TestOps(unittest.TestCase):
|
||||
|
||||
@unittest.skipUnless(Device.DEFAULT == "CPU" and DEV.renderer == "LLVM", "DEVECTORIZE=0 only for LLVM")
|
||||
def test_strided_conv2d_simple_vec(self):
|
||||
with Context(DEVECTORIZE=0): self.test_strided_conv2d_simple()
|
||||
self.test_strided_conv2d_simple()
|
||||
|
||||
@slow_test
|
||||
def test_strided_conv2d(self):
|
||||
|
||||
@@ -8,7 +8,7 @@ class TestLinearizerRewrite(unittest.TestCase):
|
||||
def test_reduction(self):
|
||||
t = Tensor.ones((64,64), device="NULL").contiguous().realize()
|
||||
out = (t*2).sum(axis=1)
|
||||
with Context(SPLIT_REDUCEOP=0, DEVECTORIZE=0):
|
||||
with Context(SPLIT_REDUCEOP=0):
|
||||
si = out.schedule_linear().src[-1]
|
||||
opts_to_apply = []
|
||||
opts_to_apply.append(Opt(OptOps.UPCAST, 0, 4))
|
||||
@@ -19,7 +19,7 @@ class TestLinearizerRewrite(unittest.TestCase):
|
||||
|
||||
def test_arange(self):
|
||||
out = Tensor.arange(32, device="NULL")
|
||||
with Context(SPLIT_REDUCEOP=0, DEVECTORIZE=0):
|
||||
with Context(SPLIT_REDUCEOP=0):
|
||||
si = out.schedule_linear().src[-1]
|
||||
opts_to_apply = []
|
||||
opts_to_apply.append(Opt(OptOps.UPCAST, 0, 4))
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
from typing import cast
|
||||
from dataclasses import replace
|
||||
import itertools
|
||||
from tinygrad.helpers import DISABLE_FAST_IDIV, DEVECTORIZE, TRANSCENDENTAL, SPEC, DEBUG, VIZ, IMAGE, NOOPT, EMULATED_DTYPES, NOLOCALS, USE_TC
|
||||
from tinygrad.helpers import DISABLE_FAST_IDIV, TRANSCENDENTAL, SPEC, DEBUG, VIZ, IMAGE, NOOPT, EMULATED_DTYPES, NOLOCALS, USE_TC
|
||||
from tinygrad.helpers import ALLOW_TF32, TracingKey, Context, panic
|
||||
from tinygrad.uop.ops import PatternMatcher, graph_rewrite, UOp, pm_lower_index_dtype, Ops, UPat, track_rewrites, KernelInfo, ProgramInfo
|
||||
from tinygrad.uop.render import pyrender
|
||||
@@ -15,7 +15,7 @@ from tinygrad.codegen.gpudims import pm_add_gpudims
|
||||
from tinygrad.uop.symbolic import sym, symbolic_simple, gep_pushing, symbolic, pm_move_where_on_load
|
||||
from tinygrad.uop.decompositions import get_late_rewrite_patterns, get_transcendental_patterns, pm_dtype_decomps
|
||||
from tinygrad.codegen.late.expander import expander, pm_pre_expander, pm_group_for_reduce
|
||||
from tinygrad.codegen.late.devectorizer import load_store_folding, load_store_indexing, devectorize, pm_reduce, \
|
||||
from tinygrad.codegen.late.devectorizer import load_store_folding, load_store_indexing, devectorize_buf_and_index, devectorize_alu, pm_reduce, \
|
||||
ReduceContext, correct_load_store, pm_render, pm_add_loads, pm_make_images
|
||||
from tinygrad.codegen.opt.postrange import apply_opts
|
||||
from tinygrad.codegen.late.gater import pm_move_gates_from_index
|
||||
@@ -74,11 +74,9 @@ def full_rewrite_to_sink(ast:UOp, ren:Renderer, optimize:bool=True) -> UOp:
|
||||
if IMAGE and ren.target.device in {"QCOM", "CL", "PYTHON", "NULL"}:
|
||||
sink = graph_rewrite(sink, pm_make_images, name="create image buffers", bottom_up=True, ctx=ren.target.arch)
|
||||
|
||||
# devectorize (TODO: does this need opts?)
|
||||
if DEVECTORIZE >= 2: pm_devectorize = sym+load_store_folding+load_store_indexing
|
||||
elif DEVECTORIZE: pm_devectorize = sym+devectorize+load_store_folding+correct_load_store+load_store_indexing
|
||||
else: pm_devectorize = sym+load_store_folding+correct_load_store+load_store_indexing
|
||||
if DEVECTORIZE >= 0: sink = graph_rewrite(sink, pm_devectorize, ctx=ren, name="devectorize")
|
||||
# devectorize
|
||||
sink = graph_rewrite(sink, sym+devectorize_alu+devectorize_buf_and_index+load_store_folding+correct_load_store+load_store_indexing,
|
||||
ctx=ren, name="devectorize")
|
||||
|
||||
# lower the index dtype to a concrete int
|
||||
sink = graph_rewrite(sink, pm_lower_index_dtype+load_store_indexing+gep_pushing, name="lower all index dtypes")
|
||||
@@ -204,7 +202,7 @@ def do_to_program(ast:UOp, renderer:Renderer) -> UOp:
|
||||
|
||||
to_program_cache: dict[tuple, UOp] = {}
|
||||
def to_program(ast:UOp, renderer:Renderer) -> UOp:
|
||||
config = (NOOPT, DEVECTORIZE, EMULATED_DTYPES, NOLOCALS, USE_TC, IMAGE, DISABLE_FAST_IDIV, TRANSCENDENTAL, ALLOW_TF32)
|
||||
config = (NOOPT, EMULATED_DTYPES, NOLOCALS, USE_TC, IMAGE, DISABLE_FAST_IDIV, TRANSCENDENTAL, ALLOW_TF32)
|
||||
key = (ast.key, type(renderer), renderer.target, *[x.value for x in config])
|
||||
if (prg:=to_program_cache.get(key)) is None: to_program_cache[key] = prg = do_to_program(ast, renderer)
|
||||
return prg
|
||||
|
||||
@@ -261,13 +261,13 @@ devectorize_buf_and_index = PatternMatcher([
|
||||
no_vectorized_index),
|
||||
])
|
||||
|
||||
devectorize = PatternMatcher([
|
||||
devectorize_alu = PatternMatcher([
|
||||
# CAST after AFTER
|
||||
(UPat(Ops.CAST, name="c").f(Ops.AFTER, allow_any_len=True, name="a"), lambda c,a: c.src[0].after(*a.src[1:]).cast(c.dtype)),
|
||||
# no ALU on vectorized dtypes
|
||||
(UPat((*GroupOp.ALU, Ops.CAST, Ops.BITCAST), name="alu"), no_vectorized_alu),
|
||||
(UPat(Ops.WMMA, name="wmma"), no_vectorized_wmma),
|
||||
])+devectorize_buf_and_index
|
||||
])
|
||||
|
||||
pm_render = PatternMatcher([
|
||||
# for rendering, we use explicit VECTORIZE
|
||||
|
||||
@@ -239,7 +239,7 @@ USE_TC, TC_SELECT, TC_OPT = ContextVar("TC", 1), ContextVar("TC_SELECT", -1), Co
|
||||
TRANSCENDENTAL, NOLOCALS = ContextVar("TRANSCENDENTAL", 1), ContextVar("NOLOCALS", 0)
|
||||
SPLIT_REDUCEOP, NO_MEMORY_PLANNER, LRU = ContextVar("SPLIT_REDUCEOP", 1), ContextVar("NO_MEMORY_PLANNER", 0), ContextVar("LRU", 1)
|
||||
RING, ALL2ALL, ALLREDUCE_CAST = ContextVar("RING", 1), ContextVar("ALL2ALL", 0), ContextVar("ALLREDUCE_CAST", 1)
|
||||
CACHELEVEL, IGNORE_BEAM_CACHE, DEVECTORIZE = ContextVar("CACHELEVEL", 2), ContextVar("IGNORE_BEAM_CACHE", 0), ContextVar("DEVECTORIZE", 1)
|
||||
CACHELEVEL, IGNORE_BEAM_CACHE = ContextVar("CACHELEVEL", 2), ContextVar("IGNORE_BEAM_CACHE", 0)
|
||||
VALIDATE_WITH_CPU = ContextVar("VALIDATE_WITH_CPU", 0)
|
||||
# TODO: this is broken for some indexing
|
||||
DISABLE_FAST_IDIV = ContextVar("DISABLE_FAST_IDIV", 1)
|
||||
|
||||
Reference in New Issue
Block a user