move tests to test/backend (#14691)

* move tests to test/backend
* fix imports
* fix CI
* revert that one
* Fix formatting in README for test command
2026-06-08 05:54:59 +08:00 · 2026-02-12 11:09:44 +08:00
parent 4b5d3bda1f
commit c331798201
61 changed files with 76 additions and 90 deletions
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -106,7 +106,7 @@ jobs:
        sudo apt update || true
        sudo apt install -y --no-install-recommends ninja-build
    - name: Test one op
-      run: FORWARD_ONLY=1 TINY_BACKEND=1 python3 test/test_ops.py TestOps.test_add
+      run: FORWARD_ONLY=1 TINY_BACKEND=1 python3 test/test_tiny.py TestTiny.test_plus
    - name: Test ResNet-18
      run: DEBUG=2 python3 extra/torch_backend/example.py
    - name: custom tests
@@ -114,7 +114,7 @@ jobs:
    - name: Test one op in torch tests
      run: DEBUG=2 python3 extra/torch_backend/torch_tests.py TestTinyBackendPRIVATEUSE1.test_unary_log_tiny_float32
    - name: Test Ops with TINY_BACKEND
-      run: CPU=1 CPU_LLVM=1 LLVMOPT=0 TINY_BACKEND=1 python3 -m pytest -n auto test/test_ops.py --durations=20
+      run: CPU=1 CPU_LLVM=1 LLVMOPT=0 TINY_BACKEND=1 python3 -m pytest -n auto test/backend/test_ops.py --durations=20
    - name: Test in-place operations on views
      run: TORCH_DEBUG=1 python3 extra/torch_backend/test_inplace.py
    - name: Test multi-gpu
@@ -158,25 +158,25 @@ jobs:
        key: be-minimal
        deps: testing_unit
    - name: Test dtype with Python emulator
-      run: DEBUG=1 PYTHON=1 python3 -m pytest -n=auto test/test_dtype.py test/test_dtype_alu.py
+      run: DEBUG=1 PYTHON=1 python3 -m pytest -n=auto test/backend/test_dtype.py test/backend/test_dtype_alu.py
    - name: Test ops with Python emulator
-      run: DEBUG=2 SKIP_SLOW_TEST=1 PYTHON=1 python3 -m pytest -n=auto test/test_ops.py --durations=20
+      run: DEBUG=2 SKIP_SLOW_TEST=1 PYTHON=1 python3 -m pytest -n=auto test/backend/test_ops.py --durations=20
    - name: Test uops with Python emulator
-      run: PYTHON=1 python3 -m pytest test/test_uops.py --durations=20
+      run: PYTHON=1 python3 -m pytest test/backend/test_uops.py --durations=20
    - name: Test symbolic with Python emulator
-      run: PYTHON=1 python3 test/test_symbolic_ops.py
+      run: PYTHON=1 python3 test/backend/test_symbolic_ops.py
    - name: test_renderer_failures with Python emulator
-      run: PYTHON=1 python3 -m pytest -rA test/test_renderer_failures.py::TestRendererFailures
+      run: PYTHON=1 python3 -m pytest -rA test/backend/test_renderer_failures.py::TestRendererFailures
    - name: Test IMAGE=2 support
      run: |
-        IMAGE=2 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm
-        IMAGE=2 PYTHON=1 python3 test/test_ops.py TestOps.test_simple_conv2d
+        IMAGE=2 PYTHON=1 python3 test/backend/test_ops.py TestOps.test_gemm
+        IMAGE=2 PYTHON=1 python3 test/backend/test_ops.py TestOps.test_simple_conv2d
    - name: Test emulated METAL tensor cores
      run: |
-        DEBUG=2 EMULATE=METAL FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_big_gemm
+        DEBUG=2 EMULATE=METAL FORWARD_ONLY=1 PYTHON=1 python3 test/backend/test_ops.py TestOps.test_big_gemm
        DEBUG=2 EMULATE=METAL FORWARD_ONLY=1 PYTHON=1 python3 test/opt/test_tensor_cores.py
    - name: Test emulated AMX tensor cores
-      run: DEBUG=2 AMX=1 EMULATE=AMX FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm
+      run: DEBUG=2 AMX=1 EMULATE=AMX FORWARD_ONLY=1 PYTHON=1 python3 test/backend/test_ops.py TestOps.test_gemm
    - name: Test emulated AMD tensor cores
      run: |
        DEBUG=2 EMULATE=AMD FORWARD_ONLY=1 PYTHON=1 N=16 HALF=1 ACC_HALF=0 python3 ./extra/gemm/simple_matmul.py
@@ -197,9 +197,9 @@ jobs:
        DEBUG=2 EMULATE=AMD_RDNA4 FORWARD_ONLY=1 PYTHON=1 python3 test/opt/test_tensor_cores.py
    - name: Test emulated CUDA tensor cores
      run: |
-        DEBUG=2 EMULATE=CUDA FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm_fp16
-        DEBUG=2 EMULATE=CUDA ALLOW_TF32=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm
-        DEBUG=2 EMULATE=CUDA_SM75 FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm_fp16
+        DEBUG=2 EMULATE=CUDA FORWARD_ONLY=1 PYTHON=1 python3 test/backend/test_ops.py TestOps.test_gemm_fp16
+        DEBUG=2 EMULATE=CUDA ALLOW_TF32=1 FORWARD_ONLY=1 PYTHON=1 python3 test/backend/test_ops.py TestOps.test_gemm
+        DEBUG=2 EMULATE=CUDA_SM75 FORWARD_ONLY=1 PYTHON=1 python3 test/backend/test_ops.py TestOps.test_gemm_fp16
        DEBUG=2 EMULATE=CUDA_SM89 ALLOW_TF32=1 FORWARD_ONLY=1 PYTHON=1 python3 test/opt/test_tensor_cores.py
    - name: Test emulated INTEL OpenCL tensor cores
      run: DEBUG=2 EMULATE=INTEL FORWARD_ONLY=1 PYTHON=1 HALF=1 N=64 python3 ./extra/gemm/simple_matmul.py
@@ -271,7 +271,7 @@ jobs:
    - name: Run NULL backend tests
      run: NULL=1 python -m pytest -n=auto test/null/ --durations=20
    - name: Run targetted tests on NULL backend
-      run: NULL=1 python3 -m unittest test.test_multitensor.TestMultiTensor.test_data_parallel_resnet_train_step
+      run: NULL=1 python3 -m unittest test.backend.test_multitensor.TestMultiTensor.test_data_parallel_resnet_train_step
    # TODO: too slow
    # - name: Run SDXL on NULL backend
    #   run: NULL=1 DEBUG=1 python3 examples/sdxl.py --seed 0 --noshow --timing --fakeweights
@@ -316,7 +316,7 @@ jobs:
        deps: testing_unit
        python-version: '3.14'
    - name: Test SPEC=2
-      run: SPEC=2 pytest --maxfail=10 -n auto --durations=30 --ignore=test/models --ignore=test/null --ignore test/test_custom_kernel.py --ignore test/unit/test_hashing.py --timeout 60 -k "not test_setitem_big" --splits 2 --group ${{ matrix.group }}
+      run: SPEC=2 pytest --maxfail=10 -n auto --durations=30 --ignore=test/models --ignore=test/null --ignore test/backend/test_custom_kernel.py --ignore test/unit/test_hashing.py --timeout 60 -k "not test_setitem_big" --splits 2 --group ${{ matrix.group }}

  fuzzing:
    name: Fuzzing
@@ -354,7 +354,7 @@ jobs:
          opencl: 'true'
      - name: Test CL IMAGE=2 ops
        run: |
-          CL=1 IMAGE=2 python -m pytest -n=auto test/test_ops.py --durations=20
+          CL=1 IMAGE=2 python -m pytest -n=auto test/backend/test_ops.py --durations=20
        # TODO: training is broken
        # CL=1 IMAGE=2 python test/models/test_end2end.py TestEnd2End.test_linear_mnist
      - name: Run process replay tests
@@ -378,7 +378,7 @@ jobs:
      - name: Run Kernel Count Test
        run: CL=1 python -m pytest -n=auto test/external/external_test_opt.py
      - name: Run fused optimizer tests
-        run: CL=1 FUSE_OPTIM=1 python -m pytest -n=auto test/models/test_mnist.py test/test_optim.py -k "not muon"
+        run: CL=1 FUSE_OPTIM=1 python -m pytest -n=auto test/models/test_mnist.py test/backend/test_optim.py -k "not muon"
      - name: Upload artifact
        uses: actions/upload-artifact@v4
        with:
@@ -437,7 +437,7 @@ jobs:
      - name: Test Additional ONNX Ops (CPU)
        run: CPU=1 CPU_LLVM=0 python3 test/external/external_test_onnx_ops.py
      - name: Test Quantize ONNX
-        run: CPU=1 CPU_LLVM=0 python3 test/test_quantize_onnx.py
+        run: CPU=1 CPU_LLVM=0 python3 test/backend/test_quantize_onnx.py
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

@@ -551,11 +551,11 @@ jobs:
        pydeps: "pillow"
        llvm: "true"
    - name: Test LLVM=1 DEVECTORIZE=0
-      run: CPU=1 CPU_LLVM=1 DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py
+      run: CPU=1 CPU_LLVM=1 DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/backend/test_ops.py
    - name: Test LLVM=1 DEVECTORIZE=0 for model
      run: CPU=1 CPU_LLVM=1 DEVECTORIZE=0 python3 test/models/test_efficientnet.py
    - name: Test CPU=1 DEVECTORIZE=0
-      run: CPU=1 CPU_LLVM=0 DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py
+      run: CPU=1 CPU_LLVM=0 DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/backend/test_ops.py

  testdsp:
    name: Linux (DSP)
@@ -587,9 +587,9 @@ jobs:
    - name: Run test_tiny on DSP
      run: DEBUG=2 DSP=1 python test/test_tiny.py
    - name: Test transcendentals
-      run: CC=clang-20 DEBUG=2 DSP=1 python test/test_transcendental.py TestTranscendentalVectorized
+      run: CC=clang-20 DEBUG=2 DSP=1 python test/backend/test_transcendental.py TestTranscendentalVectorized
    - name: Test quantize onnx
-      run: DEBUG=2 DSP=1 python3 test/test_quantize_onnx.py
+      run: DEBUG=2 DSP=1 python3 test/backend/test_quantize_onnx.py

  testwebgpu:
    name: Linux (WebGPU)
@@ -608,7 +608,7 @@ jobs:
    - name: Check Device.DEFAULT (WEBGPU) and print some source
      run: |
        WEBGPU=1 python -c "from tinygrad import Device; assert Device.DEFAULT == 'WEBGPU', Device.DEFAULT"
-        WEBGPU=1 DEBUG=4 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
+        WEBGPU=1 DEBUG=4 FORWARD_ONLY=1 python3 test/test_tiny.py TestTiny.test_plus
    - name: Run selected webgpu tests
      run: |
          WEBGPU=1 WEBGPU_BACKEND="WGPUBackendType_Vulkan" python3 -m pytest -n=auto test/ --ignore=test/models --ignore=test/unit --ignore=test/null --durations=20
@@ -642,19 +642,19 @@ jobs:
      - name: Check Device.DEFAULT and print some source
        run: |
          python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['AMD'], Device.DEFAULT"
-          DEBUG=5 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
+          DEBUG=5 FORWARD_ONLY=1 python3 test/test_tiny.py TestTiny.test_plus
      - name: Run LLVM test
        if: matrix.backend=='amdllvm'
        run: python test/device/test_amd_llvm.py
      - name: Run pytest (amd)
-        run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/test_jit.py test/test_graph.py test/test_multitensor.py test/device/test_hcq.py test/testextra/test_cfg_viz.py --durations=20
+        run: python -m pytest -n=auto test/backend/test_ops.py test/backend/test_dtype.py test/backend/test_dtype_alu.py test/backend/test_linearizer.py test/backend/test_randomness.py test/backend/test_jit.py test/backend/test_graph.py test/backend/test_multitensor.py test/device/test_hcq.py test/testextra/test_cfg_viz.py --durations=20
      - name: Run pytest (amd)
        run: python -m pytest test/external/external_test_am.py --durations=20
      - name: Run TRANSCENDENTAL math
-        run: TRANSCENDENTAL=2 python -m pytest -n=auto test/test_ops.py::TestOps::test_sin test/test_ops.py::TestOps::test_cos test/test_ops.py::TestOps::test_tan test/test_ops.py::TestOps::test_exp test/test_ops.py::TestOps::test_log --durations=20
+        run: TRANSCENDENTAL=2 python -m pytest -n=auto test/backend/test_ops.py::TestOps::test_sin test/backend/test_ops.py::TestOps::test_cos test/backend/test_ops.py::TestOps::test_tan test/backend/test_ops.py::TestOps::test_exp test/backend/test_ops.py::TestOps::test_log --durations=20
      - name: Run TestOps.test_add with SQTT
        run: |
-          VIZ=-2 DEBUG=5 python3 test/test_ops.py TestOps.test_add
+          VIZ=-2 DEBUG=5 python3 test/backend/test_ops.py TestOps.test_add
          extra/sqtt/rgptool.py create "/tmp/profile.pkl.$USER" -o /tmp/gpu0.rgp
      - name: Run AMD emulated mmapeak on NULL backend
        env:
@@ -700,12 +700,12 @@ jobs:
      - name: Run RDNA3 emulator tests (AMD_LLVM=1)
        run: AMD_LLVM=1 python -m pytest -n=auto extra/assembly/amd/ --durations 20
      - name: Run RDNA3 dtype tests
-        run: AMD_LLVM=0 pytest -n=auto test/test_dtype_alu.py test/test_dtype.py --durations 20
+        run: AMD_LLVM=0 pytest -n=auto test/backend/test_dtype_alu.py test/backend/test_dtype.py --durations 20
      - name: Run RDNA3 dtype tests (AMD_LLVM=1)
-        run: AMD_LLVM=1 pytest -n=auto test/test_dtype_alu.py test/test_dtype.py --durations 20
+        run: AMD_LLVM=1 pytest -n=auto test/backend/test_dtype_alu.py test/backend/test_dtype.py --durations 20
      # TODO: run all once emulator is faster
      - name: Run RDNA3 ops tests
-        run: SKIP_SLOW_TEST=1 AMD_LLVM=0 pytest -n=auto test/test_ops.py -k "test_sparse_categorical_crossentropy or test_tril or test_nonzero or test_softmax_argmax" --durations 20
+        run: SKIP_SLOW_TEST=1 AMD_LLVM=0 pytest -n=auto test/backend/test_ops.py -k "test_sparse_categorical_crossentropy or test_tril or test_nonzero or test_softmax_argmax" --durations 20
      - name: Run RDNA4 emulator tests
        run: MOCKGPU_ARCH=rdna4 python -m pytest test/test_tiny.py -v --durations 20

@@ -736,12 +736,12 @@ jobs:
      - name: Check Device.DEFAULT and print some source
        run: |
          python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['CUDA','NV'], Device.DEFAULT"
-          DEBUG=5 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
+          DEBUG=5 FORWARD_ONLY=1 python3 test/test_tiny.py TestTiny.test_plus
      - name: Run pytest (cuda)
        # skip multitensor because it's slow
-        run: python -m pytest -n=auto test/ --ignore=test/models --ignore=test/unit --ignore=test/null --ignore test/test_multitensor.py --durations=20
+        run: python -m pytest -n=auto test/ --ignore=test/models --ignore=test/unit --ignore=test/null --ignore test/backend/test_multitensor.py --durations=20
      - name: Run TestOps.test_add with PMA
-        run: VIZ=-1 PMA=1 DEBUG=5 python3 test/test_ops.py TestOps.test_add
+        run: VIZ=-1 PMA=1 DEBUG=5 python3 test/backend/test_ops.py TestOps.test_add
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

@@ -770,11 +770,11 @@ jobs:
      - name: Check Device.DEFAULT and print some source
        run: |
          python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['CPU','CL'], Device.DEFAULT"
-          DEBUG=5 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
+          DEBUG=5 FORWARD_ONLY=1 python3 test/test_tiny.py TestTiny.test_plus
      - name: Run pytest (${{ matrix.backend }})
        run: python -m pytest -n=auto test/ --ignore=test/models --ignore=test/unit --ignore=test/null --durations=20
      - name: Run TRANSCENDENTAL math
-        run: TRANSCENDENTAL=2 python -m pytest -n=auto test/test_ops.py::TestOps::test_sin test/test_ops.py::TestOps::test_cos test/test_ops.py::TestOps::test_tan test/test_ops.py::TestOps::test_exp test/test_ops.py::TestOps::test_log --durations=20
+        run: TRANSCENDENTAL=2 python -m pytest -n=auto test/backend/test_ops.py::TestOps::test_sin test/backend/test_ops.py::TestOps::test_cos test/backend/test_ops.py::TestOps::test_tan test/backend/test_ops.py::TestOps::test_exp test/backend/test_ops.py::TestOps::test_log --durations=20
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

@@ -804,15 +804,15 @@ jobs:
    - name: Run ONNX
      run: METAL=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
    - name: Test tensor core ops (fake)
-      run: METAL=1 DEBUG=3 TC=2 python test/test_ops.py TestOps.test_gemm
+      run: METAL=1 DEBUG=3 TC=2 python test/backend/test_ops.py TestOps.test_gemm
    - name: Test tensor core ops (real)
-      run: METAL=1 DEBUG=3 python test/test_ops.py TestOps.test_big_gemm
+      run: METAL=1 DEBUG=3 python test/backend/test_ops.py TestOps.test_big_gemm
    - name: Test Beam Search
      run: METAL=1 IGNORE_BEAM_CACHE=1 python3 -m pytest extra/optimization/test_beam_search.py
    #- name: Fuzz Test linearizer
    #  run: METAL=1 DEPTH=4 FUZZ_N=50 FUZZ_MAX_SIZE=1000000 python test/external/fuzz_linearizer.py
    - name: Run TRANSCENDENTAL math
-      run: METAL=1 TRANSCENDENTAL=2 python -m pytest -n=auto test/test_ops.py::TestOps::test_sin test/test_ops.py::TestOps::test_cos test/test_ops.py::TestOps::test_tan test/test_ops.py::TestOps::test_exp test/test_ops.py::TestOps::test_log --durations=20
+      run: METAL=1 TRANSCENDENTAL=2 python -m pytest -n=auto test/backend/test_ops.py::TestOps::test_sin test/backend/test_ops.py::TestOps::test_cos test/backend/test_ops.py::TestOps::test_tan test/backend/test_ops.py::TestOps::test_exp test/backend/test_ops.py::TestOps::test_log --durations=20
    - name: Run pytest (amd)
      env:
        MOCKGPU: 1
@@ -854,7 +854,7 @@ jobs:
        deps: testing
        webgpu: 'true'
    - name: Test infinity math in WGSL
-      run: WEBGPU=1 python -m pytest -n=auto test/test_renderer_failures.py::TestWGSLFailures::test_multiply_infinity --durations=20
+      run: WEBGPU=1 python -m pytest -n=auto test/backend/test_renderer_failures.py::TestWGSLFailures::test_multiply_infinity --durations=20
    - name: Build WEBGPU Efficientnet
      run: WEBGPU=1 WEBGPU_BACKEND="WGPUBackendType_Metal" python3 -m examples.compile_efficientnet
    - name: Clean npm cache
@@ -944,7 +944,7 @@ jobs:
        shell: bash
        run: |
          python -c "from tinygrad import Device; assert Device.DEFAULT == {'LLVM':'CPU'}.get(x:='${{ matrix.backend }}'.upper(), x), Device.DEFAULT"
-          python -m pytest -n=auto test/test_tiny.py test/test_ops.py --durations=20
+          python -m pytest -n=auto test/test_tiny.py test/backend/test_ops.py --durations=20

 # ****** Compile-only Tests ******

@@ -973,5 +973,5 @@ jobs:
        shell: bash
        run: |
          python -c "from tinygrad import Device; assert Device.DEFAULT == 'NULL'"
-          DEBUG=4 python3 test/test_ops.py TestOps.test_add
-          python -m pytest -n=auto test/test_ops.py --durations=20
+          DEBUG=4 python3 test/backend/test_ops.py TestOps.test_add
+          python -m pytest -n=auto test/backend/test_ops.py --durations=20
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -28,7 +28,7 @@ repos:
        pass_filenames: false
      - id: tests
        name: comprehensive test suite
-        entry: env OMP_NUM_THREADS=1 SKIP_SLOW_TEST=1 PYTHONPATH="." python3 -m pytest -n=6 test/test_ops.py test/test_schedule.py test/unit/test_assign.py test/test_tensor.py test/test_jit.py test/unit/test_schedule_cache.py test/null/test_pattern_matcher.py test/null/test_uop_symbolic.py test/unit/test_helpers.py
+        entry: env OMP_NUM_THREADS=1 SKIP_SLOW_TEST=1 PYTHONPATH="." python3 -m pytest -n=6 test/backend/test_ops.py test/backend/test_schedule.py test/unit/test_assign.py test/backend/test_tensor.py test/backend/test_jit.py test/unit/test_schedule_cache.py test/null/test_pattern_matcher.py test/null/test_uop_symbolic.py test/unit/test_helpers.py
        language: system
        always_run: true
        pass_filenames: false
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -41,10 +41,10 @@ Schedules are cached by graph structure. BIND nodes (variables with bound values
 python -m pytest test/unit/test_schedule_cache.py -xvs

 # Run with timeout
-python -m pytest test/test_symbolic_ops.py -x --timeout=60
+python -m pytest test/backend/test_symbolic_ops.py -x --timeout=60

 # Debug with print
-DEBUG=2 python -m pytest test/test_schedule.py::test_name -xvs
+DEBUG=2 python -m pytest test/backend/test_schedule.py::test_name -xvs

 # Visualize UOp graphs
 VIZ=1 python -c "from tinygrad import Tensor; Tensor.ones(10).sum().realize()"
--- a/README.md
+++ b/README.md
@@ -192,7 +192,7 @@ For more examples on how to run the full test suite please refer to the [CI work
 Some examples of running tests locally:
 ```sh
 python3 -m pip install -e '.[testing]'  # install extra deps for testing
-python3 test/test_ops.py                # just the ops tests
+python3 test/backend/test_ops.py        # just the ops tests
 python3 -m pytest test/                 # whole test suite
 ```

--- a/extra/optimization/generate_dataset.sh
+++ b/extra/optimization/generate_dataset.sh
@@ -7,7 +7,7 @@ export CAPTURE_PROCESS_REPLAY=1
 rm "$LOGOPS" 2>/dev/null || true
 test/external/process_replay/reset.py

-CI=1 python3 -m pytest -n=auto test/test_ops.py test/test_nn.py test/unit/test_winograd.py test/null/test_real_world.py --durations=20
+CI=1 python3 -m pytest -n=auto test/backend/test_ops.py test/backend/test_nn.py test/unit/test_winograd.py test/null/test_real_world.py --durations=20
 CL=1 python3 -m pytest test/test_tiny.py

 # extract, sort and uniq
--- a/extra/sqtt/examples/generate_examples.py
+++ b/extra/sqtt/examples/generate_examples.py
@@ -6,7 +6,7 @@ EXAMPLES_DIR = Path(__file__).parent
 PROFILE_PATH = Path(temp("profile.pkl", append_user=True))

 EXAMPLES = [
-  "test.test_custom_kernel.TestCustomKernel.test_empty",
+  "test.backend.test_custom_kernel.TestCustomKernel.test_empty",
  "test.test_tiny.TestTiny.test_plus",
  "test.test_tiny.TestTiny.test_gemm",
 ]
--- a/extra/test_mi350.sh
+++ b/extra/test_mi350.sh
@@ -1,7 +1,7 @@
 #!/bin/bash

-AMD=1 AMD_LLVM=1 python -m pytest -n=1 test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/test_jit.py test/test_graph.py test/test_multitensor.py --durations=20
-AMD=1 AMD_LLVM=0 python -m pytest -n=1 test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/test_jit.py test/test_graph.py test/test_multitensor.py --durations=20
+AMD=1 AMD_LLVM=1 python -m pytest -n=1 test/backend/test_ops.py test/backend/test_dtype.py test/backend/test_dtype_alu.py test/backend/test_linearizer.py test/backend/test_randomness.py test/backend/test_jit.py test/backend/test_graph.py test/backend/test_multitensor.py --durations=20
+AMD=1 AMD_LLVM=0 python -m pytest -n=1 test/backend/test_ops.py test/backend/test_dtype.py test/backend/test_dtype_alu.py test/backend/test_linearizer.py test/backend/test_randomness.py test/backend/test_jit.py test/backend/test_graph.py test/backend/test_multitensor.py --durations=20

 CNT=1 AMD_LLVM=0 DEBUG=2 FP8E4M3=1 HALF=0 BFLOAT16=0 SHOULD_USE_TC=1 python extra/gemm/simple_matmul.py
 CNT=1 AMD_LLVM=0 DEBUG=2 FP8E4M3=0 HALF=1 BFLOAT16=0 SHOULD_USE_TC=1 python extra/gemm/simple_matmul.py
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -122,7 +122,7 @@ do_not_mutate = [
  "tinygrad/helpers.py",
  "tinygrad/tensor.py",
 ]
-tests_dir = ["test/test_tiny.py", "test/test_ops.py"]
+tests_dir = ["test/test_tiny.py", "test/backend/test_ops.py"]
 debug = true


--- a/test/Dockerfile
+++ b/test/Dockerfile
@@ -1,12 +0,0 @@
-FROM ubuntu:22.04
-
-# Install python3.10, and pip3
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    python3.10 \
-    python3-pip \
-    && rm -rf /var/lib/apt/lists/*
-
-# Install python dependencies
-COPY . ./tinygrad
-WORKDIR tinygrad
-RUN pip install -e .
--- a/test/README
+++ b/test/README
@@ -0,0 +1,5 @@
+Three groups of tests run in CI
+
+backend -- tests that run on each backend
+null -- tests that don't require any backend
+unit -- tests that only run on a single backend in CI
--- a/test/backend/__init__.py
+++ b/test/backend/__init__.py
--- a/test/backend/test_arange.py
+++ b/test/backend/test_arange.py
--- a/test/backend/test_const_folding.py
+++ b/test/backend/test_const_folding.py
--- a/test/backend/test_custom_kernel.py
+++ b/test/backend/test_custom_kernel.py
--- a/test/backend/test_dtype.py
+++ b/test/backend/test_dtype.py
--- a/test/backend/test_dtype_alu.py
+++ b/test/backend/test_dtype_alu.py
--- a/test/backend/test_edgecases.py
+++ b/test/backend/test_edgecases.py
--- a/test/backend/test_graph.py
+++ b/test/backend/test_graph.py
--- a/test/backend/test_image_dtype.py
+++ b/test/backend/test_image_dtype.py
--- a/test/backend/test_interop.py
+++ b/test/backend/test_interop.py
--- a/test/backend/test_jit.py
+++ b/test/backend/test_jit.py
--- a/test/backend/test_jit_cases.py
+++ b/test/backend/test_jit_cases.py
--- a/test/backend/test_jit_footguns.py
+++ b/test/backend/test_jit_footguns.py
--- a/test/backend/test_kernel_cache.py
+++ b/test/backend/test_kernel_cache.py
--- a/test/backend/test_linearizer.py
+++ b/test/backend/test_linearizer.py
--- a/test/backend/test_linearizer_dumb.py
+++ b/test/backend/test_linearizer_dumb.py
--- a/test/backend/test_multitensor.py
+++ b/test/backend/test_multitensor.py
--- a/test/backend/test_nn.py
+++ b/test/backend/test_nn.py
--- a/test/backend/test_ops.py
+++ b/test/backend/test_ops.py
--- a/test/backend/test_opt_gemm.py
+++ b/test/backend/test_opt_gemm.py
--- a/test/backend/test_optim.py
+++ b/test/backend/test_optim.py
--- a/test/backend/test_outerworld.py
+++ b/test/backend/test_outerworld.py
--- a/test/backend/test_outerworld_range.py
+++ b/test/backend/test_outerworld_range.py
--- a/test/backend/test_pickle.py
+++ b/test/backend/test_pickle.py
--- a/test/backend/test_profiler.py
+++ b/test/backend/test_profiler.py
@@ -198,7 +198,7 @@ class TestProfiler(unittest.TestCase):
  @unittest.skip("this test is flaky")
  @unittest.skipUnless(Device[Device.DEFAULT].graph is not None, "graph support required")
  def test_graph(self):
-    from test.test_graph import helper_alloc_rawbuffer, helper_exec_op, helper_test_graphs
+    from test.backend.test_graph import helper_alloc_rawbuffer, helper_exec_op, helper_test_graphs
    device = TestProfiler.d0.device
    bufs = [helper_alloc_rawbuffer(device, fill=True) for _ in range(5)]
    graphs = [[helper_exec_op(device, bufs[0], [bufs[1], bufs[2]]), helper_exec_op(device, bufs[0], [bufs[3], bufs[4]]),]]
--- a/test/backend/test_quantize_onnx.py
+++ b/test/backend/test_quantize_onnx.py
--- a/test/backend/test_randomness.py
+++ b/test/backend/test_randomness.py
--- a/test/backend/test_rangeify.py
+++ b/test/backend/test_rangeify.py
--- a/test/backend/test_renderer_failures.py
+++ b/test/backend/test_renderer_failures.py
--- a/test/backend/test_schedule.py
+++ b/test/backend/test_schedule.py
--- a/test/backend/test_setitem.py
+++ b/test/backend/test_setitem.py
--- a/test/backend/test_softmax_fusion.py
+++ b/test/backend/test_softmax_fusion.py
--- a/test/backend/test_stunning.py
+++ b/test/backend/test_stunning.py
--- a/test/backend/test_subbuffer.py
+++ b/test/backend/test_subbuffer.py
--- a/test/backend/test_symbolic_jit.py
+++ b/test/backend/test_symbolic_jit.py
--- a/test/backend/test_symbolic_ops.py
+++ b/test/backend/test_symbolic_ops.py
--- a/test/backend/test_tensor.py
+++ b/test/backend/test_tensor.py
--- a/test/backend/test_tensor_variable.py
+++ b/test/backend/test_tensor_variable.py
--- a/test/backend/test_to_numpy.py
+++ b/test/backend/test_to_numpy.py
--- a/test/backend/test_transcendental.py
+++ b/test/backend/test_transcendental.py
@@ -2,8 +2,8 @@ import unittest
 from tinygrad import Tensor, Device, dtypes
 from tinygrad.tensor import _to_np_dtype
 from tinygrad.helpers import Context, getenv, CI, OSX
-from test.test_schedule import check_schedule
-from test.test_dtype_alu import ht, dtypes_float
+from test.backend.test_schedule import check_schedule
+from test.backend.test_dtype_alu import ht, dtypes_float
 from tinygrad.device import is_dtype_supported
 import numpy as np
 import math
--- a/test/backend/test_uops.py
+++ b/test/backend/test_uops.py
@@ -12,16 +12,9 @@ from tinygrad.engine.schedule import ExecItem
 from tinygrad.device import is_dtype_supported
 from tinygrad.codegen.opt import Opt, OptOps
 from tinygrad.renderer.ptx import PTXRenderer
-from test.helpers import get_uops
+from test.helpers import to_uops_list
 from dataclasses import replace

-def to_uops_list(u:list[UOp], ren=None) -> list[UOp]:
-  sink = UOp.group(*u)
-  for r in sink.ranges: sink = sink.end(r)
-  ret = get_uops(sink.sink(arg=KernelInfo(opts_to_apply=())), ren)
-  assert ret[-1].op is Ops.SINK
-  return ret
-
 def _uops_to_prg(uops_list):
  prg = get_program(UOp.sink(*uops_list, arg=KernelInfo()), Device[Device.DEFAULT].renderer)
  return CompiledRunner(replace(prg, device=Device.DEFAULT))
--- a/test/backend/test_zero_copy.py
+++ b/test/backend/test_zero_copy.py
--- a/test/external/external_benchmark_bert_softmax.py
+++ b/test/external/external_benchmark_bert_softmax.py
@@ -2,7 +2,7 @@ from tinygrad import Tensor, dtypes, GlobalCounters
 dtypes.default_float = dtypes.float16
 from tinygrad.dtype import to_dtype
 from tinygrad.helpers import getenv
-from test.test_softmax_fusion import single_kernel_softmax
+from test.backend.test_softmax_fusion import single_kernel_softmax

 if __name__ == "__main__":
  # softmax in bert layers
--- a/test/external/process_replay/local.sh
+++ b/test/external/process_replay/local.sh
@@ -3,7 +3,7 @@
 set -e
 HEAD=$(git rev-parse --abbrev-ref HEAD)
 python test/external/process_replay/reset.py
-CAPTURE_PROCESS_REPLAY=1 python test/test_ops.py TestOps.test_add
+CAPTURE_PROCESS_REPLAY=1 python test/backend/test_ops.py TestOps.test_add
 git checkout master
 git checkout $HEAD -- test/external/process_replay/process_replay.py
 ASSERT_PROCESS_REPLAY=${ASSERT_PROCESS_REPLAY:-1} python test/external/process_replay/process_replay.py
--- a/test/helpers.py
+++ b/test/helpers.py
@@ -67,6 +67,13 @@ def eval_uop(uop:UOp, inputs:list[tuple[DType, list[Any]]]|None=None):
  prog(out_buf:=allocator.alloc(uop.dtype.itemsize), *bufs)
  return out_buf.cast(uop.dtype.fmt or "").tolist()[0]

+def to_uops_list(u:list[UOp], ren=None) -> list[UOp]:
+  sink = UOp.group(*u)
+  for r in sink.ranges: sink = sink.end(r)
+  ret = get_uops(sink.sink(arg=KernelInfo(opts_to_apply=())), ren)
+  assert ret[-1].op is Ops.SINK
+  return ret
+
 def not_support_multi_device():
  # CL and CUDA don't support multi device if in CI
  return CI and Device.DEFAULT in ("CL", "CUDA")
--- a/test/null/test_uop_graph.py
+++ b/test/null/test_uop_graph.py
@@ -5,7 +5,7 @@ from tinygrad.helpers import DEBUG, Context
 from tinygrad.uop.ops import Ops, UOp, UPat, PatternMatcher, track_rewrites, graph_rewrite, GroupOp, AxisType
 from tinygrad.uop.symbolic import sym
 from tinygrad.codegen.late.expander import expander
-from test.test_uops import to_uops_list
+from test.helpers import to_uops_list

 simple_pm = PatternMatcher([
  (UPat.cvar('x', dtypes.int), lambda x: UOp.const(dtypes.float, 1.0) + UOp.const(dtypes.float, 2.0)),
--- a/test/null/test_uops.py
+++ b/test/null/test_uops.py
@@ -5,17 +5,10 @@ from tinygrad.tensor import Tensor
 from tinygrad.helpers import Timing, Context
 from tinygrad.dtype import dtypes, ConstFloat  # noqa: F401
 from tinygrad.device import Device
-from tinygrad.uop.ops import Ops, UOp, UPat, KernelInfo, exec_alu
+from tinygrad.uop.ops import Ops, UOp, UPat, exec_alu
 from tinygrad.uop.spec import shared_spec
 from tinygrad.uop.symbolic import sym
-from test.helpers import get_uops
-
-def to_uops_list(u:list[UOp], ren=None) -> list[UOp]:
-  sink = UOp.group(*u)
-  for r in sink.ranges: sink = sink.end(r)
-  ret = get_uops(sink.sink(arg=KernelInfo(opts_to_apply=())), ren)
-  assert ret[-1].op is Ops.SINK
-  return ret
+from test.helpers import to_uops_list

 class TestSafeCast(unittest.TestCase):
  def test_cast_folds(self):
--- a/test/null/test_validate_oob.py
+++ b/test/null/test_validate_oob.py
@@ -3,7 +3,7 @@ from tinygrad import dtypes, Variable
 from tinygrad.dtype import AddrSpace
 from tinygrad.helpers import Context
 from tinygrad.uop.ops import Ops, UOp, AxisType
-from test.test_uops import to_uops_list
+from test.helpers import to_uops_list

 class TestValidateOOB(unittest.TestCase):
  """Test z3 validation of index bounds for different ALU ops and patterns."""
--- a/test/opt/__init__.py
+++ b/test/opt/__init__.py
--- a/test/opt/test_kernel_opts.py
+++ b/test/opt/test_kernel_opts.py
@@ -3,7 +3,7 @@ from tinygrad import Device, Tensor, dtypes
 from tinygrad.codegen.opt import Opt, OptOps, KernelOptError

 # TODO: write a clean version of this
-from test.test_linearizer import helper_linearizer_opt
+from test.backend.test_linearizer import helper_linearizer_opt

 class TestKernelOpts(unittest.TestCase):
  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals")
--- a/test/opt/test_tensor_cores.py
+++ b/test/opt/test_tensor_cores.py
@@ -14,7 +14,7 @@ from tinygrad.codegen.opt import Opt, OptOps, KernelOptError
 from tinygrad.codegen.opt.tc import amd_cdna_1616128

 # TODO: write a clean version of this
-from test.test_linearizer import helper_realized_ast, helper_linearizer_opt
+from test.backend.test_linearizer import helper_realized_ast, helper_linearizer_opt

 # NOTE: get_program always passes in Device[Device.DEFAULT].renderer explicitly for process_replay!!!