remove getenv(CI) (#16365)

The CI flag is gone everywhere except test_interop, which still checks os.getenv("CI") directly because the torch MPS backend does not work in GitHub Actions.
2026-06-08 05:54:59 +08:00 · 2026-05-25 17:23:33 -07:00
parent 695a0069ed
commit 8ddd1328df
24 changed files with 60 additions and 90 deletions
--- a/extra/optimization/test_beam_search.py
+++ b/extra/optimization/test_beam_search.py
@@ -1,7 +1,6 @@
 import unittest
 import numpy as np

-from test.helpers import CI
 from tinygrad.helpers import BEAM, Timing, prod
 from tinygrad import Variable, Device, Tensor
 from tinygrad.nn import Conv2d
@@ -65,7 +64,7 @@ class TestBeamSearch(unittest.TestCase):
    actual = a.numpy()
    np.testing.assert_allclose(actual, desired)

-  @unittest.skipIf(CI, "flaky. CL_OUT_OF_RESOURCES")
+  @unittest.skip("flaky. CL_OUT_OF_RESOURCES")
  def test_conv_beam(self):
    c = Conv2d(3, 16, (3,3))
    x = rand(1,3,32,32)
--- a/test/backend/test_dtype.py
+++ b/test/backend/test_dtype.py
@@ -2,13 +2,13 @@ import contextlib, unittest, math
 import numpy as np
 import torch
 from typing import Any, List
-from tinygrad.helpers import getenv, DEBUG, EMULATED_DTYPES
+from tinygrad.helpers import getenv, DEBUG, EMULATED_DTYPES, DEV
 from tinygrad.dtype import DType, DTYPES_DICT, least_upper_dtype, fp8_to_float, float_to_fp8, _to_np_dtype, _to_torch_dtype, truncate
 from tinygrad.renderer.ptx import PTXRenderer
 from tinygrad.renderer.nir import NIRRenderer
 from tinygrad import Context, Device, Tensor, dtypes
 from hypothesis import given, settings, strategies as strat
-from test.helpers import rand_for_dtype, CI
+from test.helpers import rand_for_dtype
 from test.unit.test_dtype_spec import _assert_eq, core_dtypes, dtype_ints, dtype_floats, FP8E4M3_MAX, FP8E5M2_MAX, FP8E4M3FNUZ_MAX, FP8E5M2FNUZ_MAX
 import pytest
 pytestmark = pytest.mark.filterwarnings("ignore")
@@ -225,7 +225,7 @@ class TestFloatDType(TestDType):
@unittest.skipUnless(dtypes.double in supported_dtypes, f"no double on {Device.DEFAULT}")
 class TestDoubleDType(TestDType):
  DTYPE = dtypes.double
-  @unittest.skipIf((CI and Device.DEFAULT in {"CUDA", "NV"}) or \
+  @unittest.skipIf((DEV.interface.startswith("MOCK") and Device.DEFAULT in {"CUDA", "NV"}) or \
   isinstance(Device[Device.DEFAULT].renderer, (PTXRenderer, NIRRenderer)), "conversion not supported on CI CUDA, PTX, and NIR")  # TODO: why not?
  def test_float64_increased_precision(self):
    for func in [
--- a/test/backend/test_dtype_alu.py
+++ b/test/backend/test_dtype_alu.py
@@ -7,7 +7,6 @@ from tinygrad.runtime.ops_python import from_storage_scalar
 from tinygrad.renderer.ptx import PTXRenderer
 from tinygrad.renderer.nir import NIRRenderer
 from tinygrad.uop import Ops
-from test.helpers import CI
 import numpy as np
 import pytest
 from hypothesis import assume, given, strategies as strat, settings
@@ -331,12 +330,12 @@ class TestDTypeALU(unittest.TestCase):
  @given(ht.bool, ht.bool, strat.sampled_from(((operator.add, operator.add), (operator.mul, operator.mul))))
  def test_bool(self, a, b, op): universal_test(a, b, dtypes.bool, op)

-  @unittest.skipIf(not CI and Device.DEFAULT == "METAL", "broken on local M3")
  @given(ht.int32, ht.int32, ht.float32, strat.sampled_from(integer_binary_operations), strat.sampled_from(binary_operations))
  def test_int32_midcast_float(self, a, b, c, op1, op2): universal_test_midcast(a, b, c, op1, op2, dtypes.int32, dtypes.float32)

-  # Metal and CUDA and HIP and NIR behave differently than numpy in CI for overflows
-  skip_overflow = (CI and Device.DEFAULT in {"AMD", "NV", "CUDA"}) or isinstance(Device[Device.DEFAULT].renderer, NIRRenderer)
+  # Metal and (MOCK)CUDA and HIP and NIR behave differently than numpy for overflows
+  skip_overflow = ((DEV.interface.startswith("MOCK") and Device.DEFAULT in {"AMD", "NV", "CUDA"})
+                   or isinstance(Device[Device.DEFAULT].renderer, NIRRenderer))
  @given(strat.floats(width=32, min_value=0, max_value=10.0) if skip_overflow else ht.float32,
         strat.floats(width=32, min_value=0, max_value=10.0) if skip_overflow else ht.float32,
         ht.int32, strat.sampled_from(binary_operations), strat.sampled_from(integer_binary_operations))
--- a/test/backend/test_interop.py
+++ b/test/backend/test_interop.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-import unittest
+import unittest, os
 import torch
 import numpy as np

@@ -7,7 +7,6 @@ from tinygrad.helpers import DEV
 from tinygrad.tensor import Tensor
 from tinygrad.device import Device
 from tinygrad.dtype import _from_torch_dtype, _to_torch_dtype
-from test.helpers import CI

 MOCKGPU = DEV.interface.startswith("MOCK")

@@ -28,7 +27,7 @@ class TestInterop(unittest.TestCase):
    tg_out = tg_data[:, :, 0] * 0.2989 + tg_data[:, :, 1] * 0.5870 + tg_data[:, :, 2] * 0.1140
    tg_res = tg_out.numpy()

-    if self.torch_device == "mps" and CI:
+    if self.torch_device == "mps" and os.getenv("CI", "") != "":
      # MPS backend out of memory: https://discuss.pytorch.org/t/mps-back-end-out-of-memory-on-github-action/189773
      # Calculate expected value on cpu.
      inp = inp.cpu()
--- a/test/backend/test_ops.py
+++ b/test/backend/test_ops.py
@@ -1,4 +1,4 @@
-import time, math, unittest, functools, platform, warnings
+import time, math, unittest, functools, platform, warnings, sys
 import numpy as np
 from typing import List, Callable
 import torch
@@ -7,7 +7,6 @@ from tinygrad import Tensor, Device, dtypes
 from tinygrad.tensor import _to_np_dtype
 from tinygrad.renderer.cstyle import QCOMCLRenderer
 from tinygrad.renderer.nir import NIRRenderer
-from test.helpers import CI

 TINY_BACKEND = getenv("TINY_BACKEND")
 if TINY_BACKEND:
@@ -74,7 +73,7 @@ def helper_test_op(shps, torch_fxn, tinygrad_fxn=None, atol=1e-6, rtol=1e-3, gra
    for i, (t, torch_grad) in enumerate(zip(tiny_grads, torch_grads)):
      compare(f"backward pass tensor {i}", t.numpy(), torch_grad.detach().cpu().numpy(), atol=grad_atol, rtol=grad_rtol)

-  if not CI:
+  if sys.stdout.isatty():
    print("\ntesting %40r   torch/tinygrad fp: %.2f / %.2f ms  bp: %.2f / %.2f ms " % \
          (shps, torch_fp*1000, tinygrad_fp*1000, torch_fbp*1000, tinygrad_fbp*1000), end="")

@@ -103,7 +102,7 @@ class TestOps(unittest.TestCase):
    with self.assertRaises(expected) as tinygrad_cm:
      tinygrad_fxn(*tst)
    if exact: self.assertEqual(str(torch_cm.exception), str(tinygrad_cm.exception))
-    if not CI: print("\ntesting %40r   torch/tinygrad exception: %s / %s" % (shps, torch_cm.exception, tinygrad_cm.exception), end="")
+    if sys.stdout.isatty(): print("\ntesting %40r   torch/tinygrad exception: %s / %s" % (shps, torch_cm.exception, tinygrad_cm.exception), end="")

  def test_full_like(self):
    a = Tensor([[1,2,3],[4,5,6]], dtype=dtypes.float32)
--- a/test/backend/test_profiler.py
+++ b/test/backend/test_profiler.py
@@ -5,7 +5,6 @@ from tinygrad.device import Buffer, BufferSpec, Compiled, ProfileDeviceEvent, Pr
 from tinygrad.runtime.support.hcq import HCQCompiled
 from tinygrad.engine.realize import get_runtime
 from tinygrad.codegen import to_program
-from test.helpers import CI

 MOCKGPU = DEV.interface.startswith("MOCK")
 def _dev_base(d):
@@ -145,7 +144,8 @@ class TestProfiler(unittest.TestCase):
    assert len(graph_evs) == 2, "2 graph events are expected"
    assert len(graph_evs[0].ents) == 2, "two entities are expected"

-  @unittest.skipIf(CI or not issubclass(type(Device[Device.DEFAULT]), HCQCompiled), "skip CI")
+  @unittest.skipIf(MOCKGPU, "skip MOCKGPU")
+  @unittest.skipUnless(issubclass(type(Device[Device.DEFAULT]), HCQCompiled), "must be HCQ")
  def test_dev_jitter_matrix(self):
    dev_cnt = 6
    try: devs = [Device[f"{Device.DEFAULT}:{i}"] for i in range(dev_cnt)]
--- a/test/backend/test_randomness.py
+++ b/test/backend/test_randomness.py
@@ -1,14 +1,14 @@
 import unittest, math

 from tinygrad import dtypes, Tensor, Device
-from tinygrad.helpers import getenv
+from tinygrad.helpers import getenv, DEV
 from tinygrad.codegen import to_program

 from tinygrad.uop.ops import Ops
 from tinygrad.renderer.ptx import PTXRenderer
 from tinygrad.renderer.nir import NIRRenderer
 from tinygrad.renderer.isa.x86 import X86Renderer
-from test.helpers import not_support_multi_device, needs_second_gpu, CI
+from test.helpers import not_support_multi_device, needs_second_gpu
 from test.unit.test_randomness import equal_distribution, normal_test

 import numpy as np
@@ -48,7 +48,7 @@ class TestRandomness(unittest.TestCase):
    assert nx[nx == 0].size > 0
    equal_distribution(lambda *x: Tensor.rand(*x, dtype=dtypes.float16), torch.rand, lambda x: np.random.rand(*x), shape=(2, N, N))

-  @unittest.skipIf(CI and Device.DEFAULT in {"NV", "CUDA"}, "gpuocelot doesn't support certain ops needed for threefry")
+  @unittest.skipIf(DEV.interface.startswith("MOCK") and Device.DEFAULT in {"NV", "CUDA"}, "gpuocelot doesn't support certain ops needed for threefry")
  def test_threefry_against_reference(self):
    Tensor.manual_seed(1337)

--- a/test/backend/test_schedule.py
+++ b/test/backend/test_schedule.py
@@ -10,9 +10,8 @@ from hypothesis import assume, given, strategies as strat
 from tinygrad import nn, dtypes, Device, Tensor, Variable
 from tinygrad.dtype import DType
 from tinygrad.uop.ops import UOp, Ops, UPat
-from tinygrad.helpers import DEBUG, OSX, GlobalCounters, Context, getenv, all_same, temp
+from tinygrad.helpers import DEBUG, DEV, OSX, GlobalCounters, Context, getenv, all_same, temp
 from tinygrad.engine.realize import compile_linear, run_linear
-from test.helpers import CI

 supported_dtypes = Device[Device.DEFAULT].renderer.supported_dtypes()

@@ -115,7 +114,6 @@ class TestSchedule(unittest.TestCase):
    run_linear(*check_schedule(b, 1))
    np.testing.assert_allclose(b.numpy(), np.broadcast_to(a.numpy().astype(np.float16), (2, 4, 4))+2, rtol=1e-3)

-  @unittest.skipIf(CI and Device.DEFAULT == "NV", "crashes on NV CI")
  def test_add_chain_buffers(self):
    N = 31
    with Context(TRACK_MATCH_STATS=0, DEBUG=0):
@@ -1114,7 +1112,7 @@ class TestSchedule(unittest.TestCase):
      self.assertListEqual(a.tolist(), [[1.]*shape[1]]*shape[0])

 class TestLimitBufs(unittest.TestCase):
-  @unittest.skipIf(CI and Device.DEFAULT == "NV", "crashes on NV CI")
+  @unittest.skipIf(DEV.interface.startswith("MOCK") and Device.DEFAULT == "NV", "crashes in ocelot")
  def test_limit_bufs_with_var(self):
    N = 31
    with Context(TRACK_MATCH_STATS=0, DEBUG=0):
--- a/test/backend/test_transcendental.py
+++ b/test/backend/test_transcendental.py
@@ -2,7 +2,6 @@ import unittest
 from tinygrad import Tensor, Device, dtypes
 from tinygrad.tensor import _to_np_dtype
 from tinygrad.helpers import Context, getenv, DEV, OSX
-from test.helpers import CI
 from test.backend.test_schedule import check_schedule
 from test.backend.test_dtype_alu import ht, dtypes_float
 import numpy as np
@@ -32,7 +31,7 @@ class TestTranscendentalMath(unittest.TestCase):
    ([(Tensor.sin, np.sin)] if dtypes.ulong in supported_dtypes else [])))
  def test_float32(self, x, op):
    # wrong nan behavior on Vulkan
-    if (math.isnan(x) or (x < 0 and op[0] == Tensor.log)) and CI and Device.DEFAULT == "WEBGPU" and not OSX: return
+    if (math.isnan(x) or (x < 0 and op[0] == Tensor.log)) and Device.DEFAULT == "WEBGPU" and not OSX: return
    with Context(TRANSCENDENTAL=2), np.errstate(all='ignore'):
      np.testing.assert_allclose(op[0](Tensor([x], dtype=dtypes.float32)).numpy(),
                                 op[1](np.array([x], dtype=_to_np_dtype(dtypes.float32))),
@@ -43,7 +42,7 @@ class TestTranscendentalMath(unittest.TestCase):
    ([(Tensor.sin, np.sin)] if dtypes.ulong in supported_dtypes else [])))
  def test_float16(self, x, op):
    # wrong nan behavior on Vulkan
-    if (math.isnan(x) or (x < 0 and op[0] == Tensor.log)) and CI and Device.DEFAULT == "WEBGPU" and not OSX: return
+    if (math.isnan(x) or (x < 0 and op[0] == Tensor.log)) and Device.DEFAULT == "WEBGPU" and not OSX: return
    with Context(TRANSCENDENTAL=2), np.errstate(all='ignore'):
      np.testing.assert_allclose(op[0](Tensor([x], dtype=dtypes.float16)).numpy(),
                                 op[1](np.array([x], dtype=_to_np_dtype(dtypes.float16))),
@@ -117,7 +116,7 @@ class TestFloat16Log2(unittest.TestCase):
        np.testing.assert_allclose(result, expected, rtol=1e-3, err_msg=f"log2({val})")

  @unittest.skipUnless(dtypes.float16 in supported_dtypes, f"no float16 on {Device.DEFAULT}")
-  @unittest.skipIf(Device.DEFAULT == "WEBGPU" and CI, "Nan handling differs on Vulkan")
+  @unittest.skipIf(Device.DEFAULT == "WEBGPU" and not OSX, "Nan handling differs on Vulkan")
  def test_float16_log2_special(self):
    # special values: inf, -inf, nan, 0, negative
    with Context(TRANSCENDENTAL=2), np.errstate(all='ignore'):
--- a/test/backend/test_uops.py
+++ b/test/backend/test_uops.py
@@ -11,7 +11,7 @@ from tinygrad.engine.realize import run_linear
 from tinygrad.codegen import to_program
 from tinygrad.codegen.opt import Opt, OptOps
 from tinygrad.renderer.ptx import PTXRenderer
-from test.helpers import to_uops_list, CI
+from test.helpers import to_uops_list

 def run_uops(uops_list:list[UOp], bufs:list[Buffer]):
  buf_uops = [UOp.new_buffer(b.device, b.size, b.dtype) for b in bufs]
@@ -173,8 +173,6 @@ class TestBoolUOps(TestUOps):
  def test_where_bool(self): self._test_top_bool_fxn(Ops.WHERE, lambda a,b,c: b if a else c)

 class TestLocalAccess(unittest.TestCase):
-  # NOTE: this is failing on METAL CI, no idea why. Works locally.
-  @unittest.skipIf(Device.DEFAULT == "METAL" and CI, "failing only in CI")
  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared memory")
  def test_local_basic(self):
    uops = []
--- a/test/external/external_test_example.py
+++ b/test/external/external_test_example.py
@@ -1,8 +1,7 @@
-import unittest
+import unittest, sys
 from tinygrad import Device
 from tinygrad.tensor import Tensor
 from tinygrad.helpers import getenv, OSX
-from test.helpers import CI

 def multidevice_test(fxn):
  exclude_devices = getenv("EXCLUDE_DEVICES", "").split(",")
@@ -10,15 +9,15 @@ def multidevice_test(fxn):
    for device in Device._devices:
      # broken on OSX USB AMD, why?
      if device in ["DISK", "NPY", "FAKE", "DSP", "NULL"] or (OSX and device in ["AMD"]): continue
-      if not CI: print(device)
+      if sys.stdout.isatty(): print(device)
      if device in exclude_devices:
-        if not CI: print(f"WARNING: {device} test is excluded")
+        if sys.stdout.isatty(): print(f"WARNING: {device} test is excluded")
        continue
      with self.subTest(device=device):
        try:
          Device[device]
        except Exception:
-          if not CI: print(f"WARNING: {device} test isn't running")
+          if sys.stdout.isatty(): print(f"WARNING: {device} test isn't running")
          continue
        fxn(self, device)
  return ret
--- a/test/external/external_test_hcq.py
+++ b/test/external/external_test_hcq.py
@@ -1,10 +1,9 @@
 import unittest, ctypes, struct, time, array
 from tinygrad import Device, Tensor, dtypes
-from tinygrad.helpers import to_mv
+from tinygrad.helpers import to_mv, DEV
 from tinygrad.device import Buffer, BufferSpec
 from tinygrad.engine.realize import get_runtime
 from tinygrad.codegen import to_program
-from test.helpers import CI

 def _time_queue(q, d):
  st = time.perf_counter()
@@ -149,7 +148,7 @@ class TestHCQ(unittest.TestCase):
    val = TestHCQ.b.uop.buffer.as_memoryview().cast("f")[1]
    assert val == 0.0, f"got val {val}, should not be updated"

-  @unittest.skipIf(CI, "Can't handle async update on CPU")
+  @unittest.skipIf(DEV.interface.startswith("MOCK"), "Can't handle async update on CPU")
  def test_wait_signal(self):
    temp_signal = TestHCQ.d0._alloc_signal(value=0)
    TestHCQ.compute_queue().wait(temp_signal, value=1).signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
@@ -160,7 +159,7 @@ class TestHCQ(unittest.TestCase):
    TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value, timeout=100)
    TestHCQ.d0.timeline_value += 1

-  @unittest.skipIf(CI, "Can't handle async update on CPU")
+  @unittest.skipIf(DEV.interface.startswith("MOCK"), "Can't handle async update on CPU")
  def test_wait_copy_signal(self):
    temp_signal = TestHCQ.d0._alloc_signal(value=0)
    TestHCQ.copy_queue().wait(temp_signal, value=1).signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
--- a/test/external/external_test_jit_on_models.py
+++ b/test/external/external_test_jit_on_models.py
@@ -3,7 +3,7 @@ import unittest
 import numpy as np
 from tinygrad import Tensor, dtypes
 from tinygrad.engine.jit import TinyJit
-from test.helpers import derandomize_model, CI
+from test.helpers import derandomize_model

 from examples.llama import Transformer

@@ -27,7 +27,6 @@ class TestJittedModels(unittest.TestCase):
    helper_test_jitted_correctness(lambda: (Tensor([[1,]]),), test, test_jit)
    dtypes.default_float = old_float

-  @unittest.skipUnless(not CI, "huge for CI")
  def test_jitted_stable_diffusion(self):
    from examples.stable_diffusion import UNetModel, unet_params
    model = UNetModel(**unet_params)
--- a/test/helpers.py
+++ b/test/helpers.py
@@ -8,14 +8,11 @@ from tinygrad.tensor import _to_np_dtype
 from tinygrad.codegen import to_program
 from tinygrad.dtype import DType
 from tinygrad.nn.state import get_parameters
-from tinygrad.helpers import T, Target
+from tinygrad.helpers import T, Target, DEV
 from tinygrad.renderer import Renderer
 from tinygrad.codegen import full_rewrite_to_sink, line_rewrite, pm_linearize_cleanups
 from tinygrad.codegen.late.linearizer import linearize

-# TODO: remove this everywhere!
-CI = os.getenv("CI", "") != ""
-
 # decorator to skip slow tests by default, run with RUN_SLOW=1 to include them
 slow = unittest.skipUnless(os.getenv("RUN_SLOW"), "slow test, set RUN_SLOW=1 to run")
 from tinygrad.runtime.ops_python import PythonProgram, PythonRenderer, PythonCompiler
@@ -100,7 +97,7 @@ def to_uops_list(u:list[UOp], ren=None) -> list[UOp]:

 def not_support_multi_device():
  # CL and CUDA don't support multi device if in CI
-  return CI and Device.DEFAULT in ("CL", "CUDA")
+  return (Device.DEFAULT == "CL" and Device[Device.DEFAULT].count() < 2) or (Device.DEFAULT == "CUDA" and DEV.interface.startswith("MOCK"))

 def needs_second_gpu(fn):
  @functools.wraps(fn)
--- a/test/models/test_end2end.py
+++ b/test/models/test_end2end.py
@@ -1,19 +1,18 @@
 import torch
 from torch import nn
-import unittest
+import unittest, sys
 import numpy as np
 from tinygrad.nn.state import get_parameters, get_state_dict
 from tinygrad.nn import optim, Linear, Conv2d, BatchNorm2d
 from tinygrad.tensor import Tensor
 from extra.datasets import fetch_mnist
-from test.helpers import CI

 def compare_tiny_torch(model, model_torch, X, Y):
  with Tensor.train():
    model_torch.train()
    model_state_dict = get_state_dict(model)
    for k,v in model_torch.named_parameters():
-      if not CI: print(f"initting {k} from torch")
+      if sys.stdout.isatty(): print(f"initting {k} from torch")
      model_state_dict[k].assign(Tensor(v.detach().numpy())).realize()

    optimizer = optim.SGD(get_parameters(model), lr=0.001)
@@ -35,14 +34,14 @@ def compare_tiny_torch(model, model_torch, X, Y):
    loss_torch.backward()

    # assert losses match
-    if not CI: print(loss.realize().numpy())
-    if not CI: print(loss_torch.detach().numpy())
+    if sys.stdout.isatty(): print(loss.realize().numpy())
+    if sys.stdout.isatty(): print(loss_torch.detach().numpy())
    np.testing.assert_allclose(loss.realize().numpy(), loss_torch.detach().numpy(), atol=1e-4)

    for k,v in list(model_torch.named_parameters())[::-1]:
      g = model_state_dict[k].grad.numpy()
      gt = v.grad.detach().numpy()
-      if not CI: print("testing grads", k, model_state_dict[k].grad.dtype)
+      if sys.stdout.isatty(): print("testing grads", k, model_state_dict[k].grad.dtype)
      np.testing.assert_allclose(g, gt, atol=1e-3, err_msg=f'grad mismatch {k}')

    # take the steps
@@ -51,7 +50,7 @@ def compare_tiny_torch(model, model_torch, X, Y):

    # assert weights match
    for k,v in model_torch.named_parameters():
-      if not CI: print("testing weight", k, model_state_dict[k].dtype)
+      if sys.stdout.isatty(): print("testing weight", k, model_state_dict[k].dtype)
      np.testing.assert_allclose(model_state_dict[k].numpy(), v.detach().numpy(), atol=1e-3, err_msg=f'weight mismatch {k}')

 def get_mnist_data():
--- a/test/null/test_device.py
+++ b/test/null/test_device.py
@@ -5,7 +5,6 @@ from tinygrad import Tensor
 from tinygrad.device import Device, Compiler, enumerate_devices_str
 from tinygrad.helpers import diskcache_get, diskcache_put, getenv, Context, Target, WIN, OSX, DEV
 from tinygrad.runtime.support.c import DLL
-from test.helpers import CI

 class TestDevice(unittest.TestCase):
  def test_canonicalize(self):
@@ -67,7 +66,7 @@ class TestDevice(unittest.TestCase):
    self.assertNotEqual(result.returncode, 0)
    self.assertIn(b"deprecated", result.stderr)

-  @unittest.skipIf(WIN and CI, "skipping windows test") # TODO: subprocess causes memory violation?
+  @unittest.skipIf(WIN, "skipping windows test") # TODO: subprocess causes memory violation?
  def test_env_overwrite_default_compiler(self):
    if Device.DEFAULT == "CPU":
      from tinygrad.runtime.support.compiler_cpu import CPULLVMCompiler, ClangJITCompiler
@@ -95,7 +94,7 @@ class TestDevice(unittest.TestCase):
                        shell=True, check=True, env={**os.environ, "DEV": "AMD:HIP"})
    else: self.skipTest("only run on CPU/AMD")

-  @unittest.skipIf(WIN and CI, "skipping windows test")
+  @unittest.skipIf(WIN, "skipping windows test")
  def test_env_online(self):
    from tinygrad.runtime.support.compiler_cpu import CPULLVMCompiler, ClangJITCompiler
    try: _, _ = CPULLVMCompiler(), ClangJITCompiler()
--- a/test/null/test_winograd.py
+++ b/test/null/test_winograd.py
@@ -1,7 +1,6 @@
 import unittest, sys
 from tinygrad import Tensor, GlobalCounters, dtypes, Context
-from tinygrad.helpers import Profiling, WINO
-from test.helpers import CI
+from tinygrad.helpers import WINO

@unittest.skipIf(sys.platform.startswith("win"), "flaky on Windows")
 class TestWinograd(unittest.TestCase):
@@ -11,11 +10,6 @@ class TestWinograd(unittest.TestCase):
  def tearDown(self):
    WINO.value = self.old

-  def test_profile(self):
-    x,w = Tensor.rand(1,4,9,9).realize(), Tensor.rand(4,4,3,3).realize()
-    with Profiling(enabled=not CI, sort='time'):
-      Tensor.conv2d(x,w).realize()
-
  def test_forward_kernels(self):
    x,w = Tensor.rand(1,4,9,9).realize(), Tensor.rand(4,4,3,3).realize()
    out = Tensor.conv2d(x,w)
--- a/test/speed/external_test_copy_speed.py
+++ b/test/speed/external_test_copy_speed.py
@@ -1,7 +1,6 @@
-import unittest, numpy as np
+import unittest, numpy as np, os
 from tinygrad import Tensor, Device, TinyJit
-from tinygrad.helpers import Timing, OSX, getenv
-from test.helpers import CI
+from tinygrad.helpers import Timing, getenv
 import multiprocessing.shared_memory as shared_memory

 N = getenv("NSZ", 256)
@@ -12,7 +11,7 @@ class TestCopySpeed(unittest.TestCase):
  def testCopySHMtoDefault(self):
    s = shared_memory.SharedMemory(name="test_X", create=True, size=N*N*4)
    s.close()
-    if CI and not OSX:
+    if os.path.exists("/dev/shm"):
      t = Tensor.empty(N, N, device="disk:/dev/shm/test_X").realize()
    else:
      t = Tensor.empty(N, N, device="disk:shm:test_X").realize()
@@ -77,11 +76,8 @@ class TestCopySpeed(unittest.TestCase):
        Device[Device.DEFAULT].synchronize()
      np.testing.assert_equal(t.numpy(), x.numpy())

-  @unittest.skipIf(CI, "CI doesn't have 6 GPUs")
-  @unittest.skipIf(Device.DEFAULT != "CL", "only test this on CL")
+  @unittest.skipIf(Device.DEFAULT != "CL" or Device[Device.DEFAULT].count() != 6, "only test this on CL, with 6 gpus")
  def testCopyCPUto6GPUs(self):
-    from tinygrad.runtime.ops_cl import CLDevice
-    if len(CLDevice.device_ids) != 6: raise unittest.SkipTest("computer doesn't have 6 GPUs")
    t = Tensor.ones(N, N, device="CPU").contiguous().realize()
    print(f"buffer: {t.nbytes()*1e-9:.2f} GB")
    for _ in range(3):
--- a/test/speed/external_test_specific_conv.py
+++ b/test/speed/external_test_specific_conv.py
@@ -1,9 +1,9 @@
 import unittest
 from tinygrad import Tensor, Device, dtypes
-from test.helpers import CI
+from tinygrad.helpers import DEV
 # similar to test/external/external_test_gpu_ast.py, but universal

-@unittest.skipIf(Device.DEFAULT in {"CUDA", "NV"} and CI, "slow on CUDA CI")
+@unittest.skipIf(Device.DEFAULT in {"CUDA", "NV"} and DEV.interface.startswith("MOCK"), "slow on ocelot")
 class TestSpecific(unittest.TestCase):
  # from openpilot

--- a/test/speed/external_test_speed_v_torch.py
+++ b/test/speed/external_test_speed_v_torch.py
@@ -9,11 +9,11 @@ import torch
 torch.set_num_threads(1)
 import time
 import numpy as np
+import sys
 np.set_printoptions(linewidth=160)
 from tinygrad import Tensor, Device, GlobalCounters, TinyJit
 from tinygrad.nn import Conv2d
 from tinygrad.helpers import colorize_float, getenv, DEV
-from test.helpers import CI

 IN_CHANS = [int(x) for x in getenv("IN_CHANS", "4,16,64").split(",")]

@@ -96,7 +96,7 @@ def helper_test_generic(name, f1, f1_args, f2, f2_args):
  desc = "faster" if et_torch > et_tinygrad else "slower"
  flops = save_ops*1e-6
  mem = save_mem*1e-6
-  print(("\r" if not CI else "")+f"{name:42s} {et_torch:7.2f} ms ({flops/et_torch:9.2f} GFLOPS {mem/et_torch:7.2f} GB/s) in torch, {et_tinygrad:7.2f} ms ({flops/et_tinygrad:9.2f} GFLOPS {mem/et_tinygrad:7.2f} GB/s) in tinygrad, {colorize_float(et_tinygrad/et_torch)} {desc} {flops:10.2f} MOPS {mem:8.2f} MB")  # noqa: E501
+  print(("\r" if sys.stdout.isatty() else "")+f"{name:42s} {et_torch:7.2f} ms ({flops/et_torch:9.2f} GFLOPS {mem/et_torch:7.2f} GB/s) in torch, {et_tinygrad:7.2f} ms ({flops/et_tinygrad:9.2f} GFLOPS {mem/et_tinygrad:7.2f} GB/s) in tinygrad, {colorize_float(et_tinygrad/et_torch)} {desc} {flops:10.2f} MOPS {mem:8.2f} MB")  # noqa: E501
  atol, rtol = (1e-2, 1e-2) if torch_dt == torch.float16 else (1e-3, 1e-3)
  np.testing.assert_allclose(val_tinygrad, val_torch, atol=atol, rtol=rtol)

--- a/test/test_tiny.py
+++ b/test/test_tiny.py
@@ -2,7 +2,6 @@
 import unittest, random
 from tinygrad import Tensor, Context, Variable, TinyJit, dtypes, Device, nn
 from tinygrad.helpers import getenv
-from test.helpers import CI

 class TestTiny(unittest.TestCase):

@@ -112,7 +111,7 @@ class TestTiny(unittest.TestCase):
  # *** a model ***

  # TODO: this is failing because of how swizzling rewrites the ShapeTracker of the final STORE
-  @unittest.skipIf(CI and Device.DEFAULT == "DSP", "failing because of make things that can't be images not images")
+  @unittest.skipIf(Device.DEFAULT == "DSP", "failing because of make things that can't be images not images")
  def test_mnist(self):
    layers = [
      nn.Conv2d(1, 32, 5), Tensor.relu,
@@ -131,7 +130,7 @@ class TestTiny(unittest.TestCase):
    self.assertEqual(len(probs[0]), 10)

  # TODO: this is failing because of how swizzling rewrites the ShapeTracker of the final STORE
-  @unittest.skipIf(CI and Device.DEFAULT == "DSP", "failing because of make things that can't be images not images")
+  @unittest.skipIf(Device.DEFAULT == "DSP", "failing because of make things that can't be images not images")
  def test_mnist_backward(self):
    # NOTE: we don't have the whole model here for speed
    layers = [
--- a/test/testextra/test_bench_log.py
+++ b/test/testextra/test_bench_log.py
@@ -2,12 +2,12 @@ import unittest, time
 from unittest.case import skipIf

 from extra.bench_log import BenchEvent, InstantBenchEvent, WallTimeEvent, KernelTimeEvent, log_event_instant, _events, clear_events
-from tinygrad.helpers import Context
+from tinygrad.helpers import Context, DEV
 from tinygrad.tensor import Tensor
 from tinygrad.device import Device
-from test.helpers import CI

-_SKIP_KERNEL_TIMING = Device.DEFAULT == "WEBGPU"  # WEBGPU kernel timing not supported
+# WEBGPU kernel timing not supported, ocelot CUDA is inaccurate
+_SKIP_KERNEL_TIMING = Device.DEFAULT == "WEBGPU" or (Device.DEFAULT == "CUDA" and DEV.interface.startswith("MOCK"))

 class TestBenchLog(unittest.TestCase):
  def setUp(self):
@@ -38,7 +38,7 @@ class TestBenchLog(unittest.TestCase):
      self.assertGreater(_events[event]["wall"][0], 0)
      self.assertGreater(_events[event]["wall"][1], 0)

-  @skipIf(CI or _SKIP_KERNEL_TIMING, "ci timing is not accurate")
+  @skipIf(_SKIP_KERNEL_TIMING, "ci timing is not accurate")
  def test_log_single_kernel_time(self):
    wall_times = []

@@ -55,7 +55,7 @@ class TestBenchLog(unittest.TestCase):
      self.assertLess(_events[event]["kernel"][0], wall_times[0])
      self.assertGreater(_events[event]["kernel"][0], 0)

-  @skipIf((CI and Device.DEFAULT == "CUDA") or _SKIP_KERNEL_TIMING, "ci cuda timing is not accurate")
+  @skipIf(_SKIP_KERNEL_TIMING, "ci cuda timing is not accurate")
  def test_interleaved_wall_kernel_time(self):
    wall_times = []
    with Context(DEBUG=2):
@@ -77,7 +77,7 @@ class TestBenchLog(unittest.TestCase):
      self.assertLess(_events[event]["kernel"][0], wall_times[0])
      self.assertGreater(_events[event]["kernel"][0], 0)

-  @skipIf((CI and Device.DEFAULT == "CUDA") or _SKIP_KERNEL_TIMING, "ci cuda timing is not accurate")
+  @skipIf(_SKIP_KERNEL_TIMING, "ci cuda timing is not accurate")
  def test_stacked_wall_kernel_time(self):
    with Context(DEBUG=2):
      for event in BenchEvent:
--- a/test/unit/test_assign.py
+++ b/test/unit/test_assign.py
@@ -4,7 +4,6 @@ import numpy as np
 from tinygrad import dtypes, Tensor, TinyJit, GlobalCounters, Variable
 from tinygrad.uop.ops import Ops, UOp
 from tinygrad.helpers import temp, DEV, Context
-from test.helpers import CI

 N = 200  # has to be bigger than the cache to fail

@@ -189,7 +188,7 @@ class TestAssign(unittest.TestCase):
    new = a + times_a
    np.testing.assert_allclose(new.numpy(), 8)

-  @unittest.skipIf(CI and DEV.renderer == "LVP", "flaky in CI")
+  @unittest.skipIf(DEV.renderer == "LVP", "flaky in CI")
  def test_double_assign(self):
    a = Tensor.ones(4).contiguous().realize()
    a += 1
--- a/test/unit/test_shm_tensor.py
+++ b/test/unit/test_shm_tensor.py
@@ -2,11 +2,10 @@ import unittest
 import multiprocessing.shared_memory as shared_memory
 from tinygrad.helpers import WIN
 from tinygrad import Tensor, Device
-from test.helpers import CI
 import numpy as np

 class TestRawShmBuffer(unittest.TestCase):
-  @unittest.skipIf(WIN and CI, "only fails on CI windows instance")
+  @unittest.skipIf(WIN, "only fails on CI windows instance")
  def test_e2e(self):
    t = Tensor.randn(2, 2, 2).realize()