From 8ddd1328df26ef970ee6dbf3379b285181f7e242 Mon Sep 17 00:00:00 2001
From: Christopher Milan <chrismilan@ucla.edu>
Date: Mon, 25 May 2026 17:23:33 -0700
Subject: [PATCH] remove getenv(CI) (#16365)

The CI env check is gone everywhere except test_interop, which still reads the CI env var directly because the torch MPS backend runs out of memory in GitHub Actions.
---
 extra/optimization/test_beam_search.py       |  3 +--
 test/backend/test_dtype.py                   |  6 +++---
 test/backend/test_dtype_alu.py               |  7 +++----
 test/backend/test_interop.py                 |  5 ++---
 test/backend/test_ops.py                     |  7 +++----
 test/backend/test_profiler.py                |  4 ++--
 test/backend/test_randomness.py              |  6 +++---
 test/backend/test_schedule.py                |  6 ++----
 test/backend/test_transcendental.py          |  7 +++----
 test/backend/test_uops.py                    |  4 +---
 test/external/external_test_example.py       |  9 ++++-----
 test/external/external_test_hcq.py           |  7 +++----
 test/external/external_test_jit_on_models.py |  3 +--
 test/helpers.py                              |  7 ++-----
 test/models/test_end2end.py                  | 13 ++++++-------
 test/null/test_device.py                     |  5 ++---
 test/null/test_winograd.py                   |  8 +-------
 test/speed/external_test_copy_speed.py       | 12 ++++--------
 test/speed/external_test_specific_conv.py    |  4 ++--
 test/speed/external_test_speed_v_torch.py    |  4 ++--
 test/test_tiny.py                            |  5 ++---
 test/testextra/test_bench_log.py             | 12 ++++++------
 test/unit/test_assign.py                     |  3 +--
 test/unit/test_shm_tensor.py                 |  3 +--
 24 files changed, 60 insertions(+), 90 deletions(-)

diff --git a/extra/optimization/test_beam_search.py b/extra/optimization/test_beam_search.py
index a81b1dde55..133c779960 100644
--- a/extra/optimization/test_beam_search.py
+++ b/extra/optimization/test_beam_search.py
@@ -1,7 +1,6 @@
 import unittest
 import numpy as np
 
-from test.helpers import CI
 from tinygrad.helpers import BEAM, Timing, prod
 from tinygrad import Variable, Device, Tensor
 from tinygrad.nn import Conv2d
@@ -65,7 +64,7 @@ class TestBeamSearch(unittest.TestCase):
     actual = a.numpy()
     np.testing.assert_allclose(actual, desired)
 
-  @unittest.skipIf(CI, "flaky. CL_OUT_OF_RESOURCES")
+  @unittest.skip("flaky. CL_OUT_OF_RESOURCES")
   def test_conv_beam(self):
     c = Conv2d(3, 16, (3,3))
     x = rand(1,3,32,32)
diff --git a/test/backend/test_dtype.py b/test/backend/test_dtype.py
index 7baf8c1559..1494c44f01 100644
--- a/test/backend/test_dtype.py
+++ b/test/backend/test_dtype.py
@@ -2,13 +2,13 @@ import contextlib, unittest, math
 import numpy as np
 import torch
 from typing import Any, List
-from tinygrad.helpers import getenv, DEBUG, EMULATED_DTYPES
+from tinygrad.helpers import getenv, DEBUG, EMULATED_DTYPES, DEV
 from tinygrad.dtype import DType, DTYPES_DICT, least_upper_dtype, fp8_to_float, float_to_fp8, _to_np_dtype, _to_torch_dtype, truncate
 from tinygrad.renderer.ptx import PTXRenderer
 from tinygrad.renderer.nir import NIRRenderer
 from tinygrad import Context, Device, Tensor, dtypes
 from hypothesis import given, settings, strategies as strat
-from test.helpers import rand_for_dtype, CI
+from test.helpers import rand_for_dtype
 from test.unit.test_dtype_spec import _assert_eq, core_dtypes, dtype_ints, dtype_floats, FP8E4M3_MAX, FP8E5M2_MAX, FP8E4M3FNUZ_MAX, FP8E5M2FNUZ_MAX
 import pytest
 pytestmark = pytest.mark.filterwarnings("ignore")
@@ -225,7 +225,7 @@ class TestFloatDType(TestDType):
 @unittest.skipUnless(dtypes.double in supported_dtypes, f"no double on {Device.DEFAULT}")
 class TestDoubleDType(TestDType):
   DTYPE = dtypes.double
-  @unittest.skipIf((CI and Device.DEFAULT in {"CUDA", "NV"}) or \
+  @unittest.skipIf((DEV.interface.startswith("MOCK") and Device.DEFAULT in {"CUDA", "NV"}) or \
    isinstance(Device[Device.DEFAULT].renderer, (PTXRenderer, NIRRenderer)), "conversion not supported on CI CUDA, PTX, and NIR")  # TODO: why not?
   def test_float64_increased_precision(self):
     for func in [
diff --git a/test/backend/test_dtype_alu.py b/test/backend/test_dtype_alu.py
index ae12b8810c..05c3074bf4 100644
--- a/test/backend/test_dtype_alu.py
+++ b/test/backend/test_dtype_alu.py
@@ -7,7 +7,6 @@ from tinygrad.runtime.ops_python import from_storage_scalar
 from tinygrad.renderer.ptx import PTXRenderer
 from tinygrad.renderer.nir import NIRRenderer
 from tinygrad.uop import Ops
-from test.helpers import CI
 import numpy as np
 import pytest
 from hypothesis import assume, given, strategies as strat, settings
@@ -331,12 +330,12 @@ class TestDTypeALU(unittest.TestCase):
   @given(ht.bool, ht.bool, strat.sampled_from(((operator.add, operator.add), (operator.mul, operator.mul))))
   def test_bool(self, a, b, op): universal_test(a, b, dtypes.bool, op)
 
-  @unittest.skipIf(not CI and Device.DEFAULT == "METAL", "broken on local M3")
   @given(ht.int32, ht.int32, ht.float32, strat.sampled_from(integer_binary_operations), strat.sampled_from(binary_operations))
   def test_int32_midcast_float(self, a, b, c, op1, op2): universal_test_midcast(a, b, c, op1, op2, dtypes.int32, dtypes.float32)
 
-  # Metal and CUDA and HIP and NIR behave differently than numpy in CI for overflows
-  skip_overflow = (CI and Device.DEFAULT in {"AMD", "NV", "CUDA"}) or isinstance(Device[Device.DEFAULT].renderer, NIRRenderer)
+  # Metal and (MOCK)CUDA and HIP and NIR behave differently than numpy for overflows
+  skip_overflow = ((DEV.interface.startswith("MOCK") and Device.DEFAULT in {"AMD", "NV", "CUDA"})
+                   or isinstance(Device[Device.DEFAULT].renderer, NIRRenderer))
   @given(strat.floats(width=32, min_value=0, max_value=10.0) if skip_overflow else ht.float32,
          strat.floats(width=32, min_value=0, max_value=10.0) if skip_overflow else ht.float32,
          ht.int32, strat.sampled_from(binary_operations), strat.sampled_from(integer_binary_operations))
diff --git a/test/backend/test_interop.py b/test/backend/test_interop.py
index 7593837d7f..18346212cf 100644
--- a/test/backend/test_interop.py
+++ b/test/backend/test_interop.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-import unittest
+import unittest, os
 import torch
 import numpy as np
 
@@ -7,7 +7,6 @@ from tinygrad.helpers import DEV
 from tinygrad.tensor import Tensor
 from tinygrad.device import Device
 from tinygrad.dtype import _from_torch_dtype, _to_torch_dtype
-from test.helpers import CI
 
 MOCKGPU = DEV.interface.startswith("MOCK")
 
@@ -28,7 +27,7 @@ class TestInterop(unittest.TestCase):
     tg_out = tg_data[:, :, 0] * 0.2989 + tg_data[:, :, 1] * 0.5870 + tg_data[:, :, 2] * 0.1140
     tg_res = tg_out.numpy()
 
-    if self.torch_device == "mps" and CI:
+    if self.torch_device == "mps" and os.getenv("CI", "") != "":
       # MPS backend out of memory: https://discuss.pytorch.org/t/mps-back-end-out-of-memory-on-github-action/189773
       # Calculate expected value on cpu.
       inp = inp.cpu()
diff --git a/test/backend/test_ops.py b/test/backend/test_ops.py
index 0f21519637..67d3833700 100644
--- a/test/backend/test_ops.py
+++ b/test/backend/test_ops.py
@@ -1,4 +1,4 @@
-import time, math, unittest, functools, platform, warnings
+import time, math, unittest, functools, platform, warnings, sys
 import numpy as np
 from typing import List, Callable
 import torch
@@ -7,7 +7,6 @@ from tinygrad import Tensor, Device, dtypes
 from tinygrad.tensor import _to_np_dtype
 from tinygrad.renderer.cstyle import QCOMCLRenderer
 from tinygrad.renderer.nir import NIRRenderer
-from test.helpers import CI
 
 TINY_BACKEND = getenv("TINY_BACKEND")
 if TINY_BACKEND:
@@ -74,7 +73,7 @@ def helper_test_op(shps, torch_fxn, tinygrad_fxn=None, atol=1e-6, rtol=1e-3, gra
     for i, (t, torch_grad) in enumerate(zip(tiny_grads, torch_grads)):
       compare(f"backward pass tensor {i}", t.numpy(), torch_grad.detach().cpu().numpy(), atol=grad_atol, rtol=grad_rtol)
 
-  if not CI:
+  if sys.stdout.isatty():
     print("\ntesting %40r   torch/tinygrad fp: %.2f / %.2f ms  bp: %.2f / %.2f ms " % \
           (shps, torch_fp*1000, tinygrad_fp*1000, torch_fbp*1000, tinygrad_fbp*1000), end="")
 
@@ -103,7 +102,7 @@ class TestOps(unittest.TestCase):
     with self.assertRaises(expected) as tinygrad_cm:
       tinygrad_fxn(*tst)
     if exact: self.assertEqual(str(torch_cm.exception), str(tinygrad_cm.exception))
-    if not CI: print("\ntesting %40r   torch/tinygrad exception: %s / %s" % (shps, torch_cm.exception, tinygrad_cm.exception), end="")
+    if sys.stdout.isatty(): print("\ntesting %40r   torch/tinygrad exception: %s / %s" % (shps, torch_cm.exception, tinygrad_cm.exception), end="")
 
   def test_full_like(self):
     a = Tensor([[1,2,3],[4,5,6]], dtype=dtypes.float32)
diff --git a/test/backend/test_profiler.py b/test/backend/test_profiler.py
index 5016bdd5e7..2091f85ac4 100644
--- a/test/backend/test_profiler.py
+++ b/test/backend/test_profiler.py
@@ -5,7 +5,6 @@ from tinygrad.device import Buffer, BufferSpec, Compiled, ProfileDeviceEvent, Pr
 from tinygrad.runtime.support.hcq import HCQCompiled
 from tinygrad.engine.realize import get_runtime
 from tinygrad.codegen import to_program
-from test.helpers import CI
 
 MOCKGPU = DEV.interface.startswith("MOCK")
 def _dev_base(d):
@@ -145,7 +144,8 @@ class TestProfiler(unittest.TestCase):
     assert len(graph_evs) == 2, "2 graph events are expected"
     assert len(graph_evs[0].ents) == 2, "two entities are expected"
 
-  @unittest.skipIf(CI or not issubclass(type(Device[Device.DEFAULT]), HCQCompiled), "skip CI")
+  @unittest.skipIf(MOCKGPU, "skip MOCKGPU")
+  @unittest.skipUnless(issubclass(type(Device[Device.DEFAULT]), HCQCompiled), "must be HCQ")
   def test_dev_jitter_matrix(self):
     dev_cnt = 6
     try: devs = [Device[f"{Device.DEFAULT}:{i}"] for i in range(dev_cnt)]
diff --git a/test/backend/test_randomness.py b/test/backend/test_randomness.py
index a4aad9a826..834eb9ddca 100644
--- a/test/backend/test_randomness.py
+++ b/test/backend/test_randomness.py
@@ -1,14 +1,14 @@
 import unittest, math
 
 from tinygrad import dtypes, Tensor, Device
-from tinygrad.helpers import getenv
+from tinygrad.helpers import getenv, DEV
 from tinygrad.codegen import to_program
 
 from tinygrad.uop.ops import Ops
 from tinygrad.renderer.ptx import PTXRenderer
 from tinygrad.renderer.nir import NIRRenderer
 from tinygrad.renderer.isa.x86 import X86Renderer
-from test.helpers import not_support_multi_device, needs_second_gpu, CI
+from test.helpers import not_support_multi_device, needs_second_gpu
 from test.unit.test_randomness import equal_distribution, normal_test
 
 import numpy as np
@@ -48,7 +48,7 @@ class TestRandomness(unittest.TestCase):
     assert nx[nx == 0].size > 0
     equal_distribution(lambda *x: Tensor.rand(*x, dtype=dtypes.float16), torch.rand, lambda x: np.random.rand(*x), shape=(2, N, N))
 
-  @unittest.skipIf(CI and Device.DEFAULT in {"NV", "CUDA"}, "gpuocelot doesn't support certain ops needed for threefry")
+  @unittest.skipIf(DEV.interface.startswith("MOCK") and Device.DEFAULT in {"NV", "CUDA"}, "gpuocelot doesn't support certain ops needed for threefry")
   def test_threefry_against_reference(self):
     Tensor.manual_seed(1337)
 
diff --git a/test/backend/test_schedule.py b/test/backend/test_schedule.py
index 39bdb21fd3..a754cbdd07 100644
--- a/test/backend/test_schedule.py
+++ b/test/backend/test_schedule.py
@@ -10,9 +10,8 @@ from hypothesis import assume, given, strategies as strat
 from tinygrad import nn, dtypes, Device, Tensor, Variable
 from tinygrad.dtype import DType
 from tinygrad.uop.ops import UOp, Ops, UPat
-from tinygrad.helpers import DEBUG, OSX, GlobalCounters, Context, getenv, all_same, temp
+from tinygrad.helpers import DEBUG, DEV, OSX, GlobalCounters, Context, getenv, all_same, temp
 from tinygrad.engine.realize import compile_linear, run_linear
-from test.helpers import CI
 
 supported_dtypes = Device[Device.DEFAULT].renderer.supported_dtypes()
 
@@ -115,7 +114,6 @@ class TestSchedule(unittest.TestCase):
     run_linear(*check_schedule(b, 1))
     np.testing.assert_allclose(b.numpy(), np.broadcast_to(a.numpy().astype(np.float16), (2, 4, 4))+2, rtol=1e-3)
 
-  @unittest.skipIf(CI and Device.DEFAULT == "NV", "crashes on NV CI")
   def test_add_chain_buffers(self):
     N = 31
     with Context(TRACK_MATCH_STATS=0, DEBUG=0):
@@ -1114,7 +1112,7 @@ class TestSchedule(unittest.TestCase):
       self.assertListEqual(a.tolist(), [[1.]*shape[1]]*shape[0])
 
 class TestLimitBufs(unittest.TestCase):
-  @unittest.skipIf(CI and Device.DEFAULT == "NV", "crashes on NV CI")
+  @unittest.skipIf(DEV.interface.startswith("MOCK") and Device.DEFAULT == "NV", "crashes in ocelot")
   def test_limit_bufs_with_var(self):
     N = 31
     with Context(TRACK_MATCH_STATS=0, DEBUG=0):
diff --git a/test/backend/test_transcendental.py b/test/backend/test_transcendental.py
index 0106ecef19..8602e9f73a 100644
--- a/test/backend/test_transcendental.py
+++ b/test/backend/test_transcendental.py
@@ -2,7 +2,6 @@ import unittest
 from tinygrad import Tensor, Device, dtypes
 from tinygrad.tensor import _to_np_dtype
 from tinygrad.helpers import Context, getenv, DEV, OSX
-from test.helpers import CI
 from test.backend.test_schedule import check_schedule
 from test.backend.test_dtype_alu import ht, dtypes_float
 import numpy as np
@@ -32,7 +31,7 @@ class TestTranscendentalMath(unittest.TestCase):
     ([(Tensor.sin, np.sin)] if dtypes.ulong in supported_dtypes else [])))
   def test_float32(self, x, op):
     # wrong nan behavior on Vulkan
-    if (math.isnan(x) or (x < 0 and op[0] == Tensor.log)) and CI and Device.DEFAULT == "WEBGPU" and not OSX: return
+    if (math.isnan(x) or (x < 0 and op[0] == Tensor.log)) and Device.DEFAULT == "WEBGPU" and not OSX: return
     with Context(TRANSCENDENTAL=2), np.errstate(all='ignore'):
       np.testing.assert_allclose(op[0](Tensor([x], dtype=dtypes.float32)).numpy(),
                                  op[1](np.array([x], dtype=_to_np_dtype(dtypes.float32))),
@@ -43,7 +42,7 @@ class TestTranscendentalMath(unittest.TestCase):
     ([(Tensor.sin, np.sin)] if dtypes.ulong in supported_dtypes else [])))
   def test_float16(self, x, op):
     # wrong nan behavior on Vulkan
-    if (math.isnan(x) or (x < 0 and op[0] == Tensor.log)) and CI and Device.DEFAULT == "WEBGPU" and not OSX: return
+    if (math.isnan(x) or (x < 0 and op[0] == Tensor.log)) and Device.DEFAULT == "WEBGPU" and not OSX: return
     with Context(TRANSCENDENTAL=2), np.errstate(all='ignore'):
       np.testing.assert_allclose(op[0](Tensor([x], dtype=dtypes.float16)).numpy(),
                                  op[1](np.array([x], dtype=_to_np_dtype(dtypes.float16))),
@@ -117,7 +116,7 @@ class TestFloat16Log2(unittest.TestCase):
         np.testing.assert_allclose(result, expected, rtol=1e-3, err_msg=f"log2({val})")
 
   @unittest.skipUnless(dtypes.float16 in supported_dtypes, f"no float16 on {Device.DEFAULT}")
-  @unittest.skipIf(Device.DEFAULT == "WEBGPU" and CI, "Nan handling differs on Vulkan")
+  @unittest.skipIf(Device.DEFAULT == "WEBGPU" and not OSX, "Nan handling differs on Vulkan")
   def test_float16_log2_special(self):
     # special values: inf, -inf, nan, 0, negative
     with Context(TRANSCENDENTAL=2), np.errstate(all='ignore'):
diff --git a/test/backend/test_uops.py b/test/backend/test_uops.py
index 27ab4c3a8e..cda690e26a 100644
--- a/test/backend/test_uops.py
+++ b/test/backend/test_uops.py
@@ -11,7 +11,7 @@ from tinygrad.engine.realize import run_linear
 from tinygrad.codegen import to_program
 from tinygrad.codegen.opt import Opt, OptOps
 from tinygrad.renderer.ptx import PTXRenderer
-from test.helpers import to_uops_list, CI
+from test.helpers import to_uops_list
 
 def run_uops(uops_list:list[UOp], bufs:list[Buffer]):
   buf_uops = [UOp.new_buffer(b.device, b.size, b.dtype) for b in bufs]
@@ -173,8 +173,6 @@ class TestBoolUOps(TestUOps):
   def test_where_bool(self): self._test_top_bool_fxn(Ops.WHERE, lambda a,b,c: b if a else c)
 
 class TestLocalAccess(unittest.TestCase):
-  # NOTE: this is failing on METAL CI, no idea why. Works locally.
-  @unittest.skipIf(Device.DEFAULT == "METAL" and CI, "failing only in CI")
   @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared memory")
   def test_local_basic(self):
     uops = []
diff --git a/test/external/external_test_example.py b/test/external/external_test_example.py
index de51efb1be..37554fc22c 100644
--- a/test/external/external_test_example.py
+++ b/test/external/external_test_example.py
@@ -1,8 +1,7 @@
-import unittest
+import unittest, sys
 from tinygrad import Device
 from tinygrad.tensor import Tensor
 from tinygrad.helpers import getenv, OSX
-from test.helpers import CI
 
 def multidevice_test(fxn):
   exclude_devices = getenv("EXCLUDE_DEVICES", "").split(",")
@@ -10,15 +9,15 @@ def multidevice_test(fxn):
     for device in Device._devices:
       # broken on OSX USB AMD, why?
       if device in ["DISK", "NPY", "FAKE", "DSP", "NULL"] or (OSX and device in ["AMD"]): continue
-      if not CI: print(device)
+      if sys.stdout.isatty(): print(device)
       if device in exclude_devices:
-        if not CI: print(f"WARNING: {device} test is excluded")
+        if sys.stdout.isatty(): print(f"WARNING: {device} test is excluded")
         continue
       with self.subTest(device=device):
         try:
           Device[device]
         except Exception:
-          if not CI: print(f"WARNING: {device} test isn't running")
+          if sys.stdout.isatty(): print(f"WARNING: {device} test isn't running")
           continue
         fxn(self, device)
   return ret
diff --git a/test/external/external_test_hcq.py b/test/external/external_test_hcq.py
index 044928fe33..cb21563ae1 100644
--- a/test/external/external_test_hcq.py
+++ b/test/external/external_test_hcq.py
@@ -1,10 +1,9 @@
 import unittest, ctypes, struct, time, array
 from tinygrad import Device, Tensor, dtypes
-from tinygrad.helpers import to_mv
+from tinygrad.helpers import to_mv, DEV
 from tinygrad.device import Buffer, BufferSpec
 from tinygrad.engine.realize import get_runtime
 from tinygrad.codegen import to_program
-from test.helpers import CI
 
 def _time_queue(q, d):
   st = time.perf_counter()
@@ -149,7 +148,7 @@ class TestHCQ(unittest.TestCase):
     val = TestHCQ.b.uop.buffer.as_memoryview().cast("f")[1]
     assert val == 0.0, f"got val {val}, should not be updated"
 
-  @unittest.skipIf(CI, "Can't handle async update on CPU")
+  @unittest.skipIf(DEV.interface.startswith("MOCK"), "Can't handle async update on CPU")
   def test_wait_signal(self):
     temp_signal = TestHCQ.d0._alloc_signal(value=0)
     TestHCQ.compute_queue().wait(temp_signal, value=1).signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
@@ -160,7 +159,7 @@ class TestHCQ(unittest.TestCase):
     TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value, timeout=100)
     TestHCQ.d0.timeline_value += 1
 
-  @unittest.skipIf(CI, "Can't handle async update on CPU")
+  @unittest.skipIf(DEV.interface.startswith("MOCK"), "Can't handle async update on CPU")
   def test_wait_copy_signal(self):
     temp_signal = TestHCQ.d0._alloc_signal(value=0)
     TestHCQ.copy_queue().wait(temp_signal, value=1).signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
diff --git a/test/external/external_test_jit_on_models.py b/test/external/external_test_jit_on_models.py
index 9b1ef6217e..e2d14d9768 100644
--- a/test/external/external_test_jit_on_models.py
+++ b/test/external/external_test_jit_on_models.py
@@ -3,7 +3,7 @@ import unittest
 import numpy as np
 from tinygrad import Tensor, dtypes
 from tinygrad.engine.jit import TinyJit
-from test.helpers import derandomize_model, CI
+from test.helpers import derandomize_model
 
 from examples.llama import Transformer
 
@@ -27,7 +27,6 @@ class TestJittedModels(unittest.TestCase):
     helper_test_jitted_correctness(lambda: (Tensor([[1,]]),), test, test_jit)
     dtypes.default_float = old_float
 
-  @unittest.skipUnless(not CI, "huge for CI")
   def test_jitted_stable_diffusion(self):
     from examples.stable_diffusion import UNetModel, unet_params
     model = UNetModel(**unet_params)
diff --git a/test/helpers.py b/test/helpers.py
index fa26cac0e6..3ce7c1c8a1 100644
--- a/test/helpers.py
+++ b/test/helpers.py
@@ -8,14 +8,11 @@ from tinygrad.tensor import _to_np_dtype
 from tinygrad.codegen import to_program
 from tinygrad.dtype import DType
 from tinygrad.nn.state import get_parameters
-from tinygrad.helpers import T, Target
+from tinygrad.helpers import T, Target, DEV
 from tinygrad.renderer import Renderer
 from tinygrad.codegen import full_rewrite_to_sink, line_rewrite, pm_linearize_cleanups
 from tinygrad.codegen.late.linearizer import linearize
 
-# TODO: remove this everywhere!
-CI = os.getenv("CI", "") != ""
-
 # decorator to skip slow tests by default, run with RUN_SLOW=1 to include them
 slow = unittest.skipUnless(os.getenv("RUN_SLOW"), "slow test, set RUN_SLOW=1 to run")
 from tinygrad.runtime.ops_python import PythonProgram, PythonRenderer, PythonCompiler
@@ -100,7 +97,7 @@ def to_uops_list(u:list[UOp], ren=None) -> list[UOp]:
 
 def not_support_multi_device():
   # CL and CUDA don't support multi device if in CI
-  return CI and Device.DEFAULT in ("CL", "CUDA")
+  return (Device.DEFAULT == "CL" and Device[Device.DEFAULT].count() < 2) or (Device.DEFAULT == "CUDA" and DEV.interface.startswith("MOCK"))
 
 def needs_second_gpu(fn):
   @functools.wraps(fn)
diff --git a/test/models/test_end2end.py b/test/models/test_end2end.py
index b7742dde38..452e0d0704 100644
--- a/test/models/test_end2end.py
+++ b/test/models/test_end2end.py
@@ -1,19 +1,18 @@
 import torch
 from torch import nn
-import unittest
+import unittest, sys
 import numpy as np
 from tinygrad.nn.state import get_parameters, get_state_dict
 from tinygrad.nn import optim, Linear, Conv2d, BatchNorm2d
 from tinygrad.tensor import Tensor
 from extra.datasets import fetch_mnist
-from test.helpers import CI
 
 def compare_tiny_torch(model, model_torch, X, Y):
   with Tensor.train():
     model_torch.train()
     model_state_dict = get_state_dict(model)
     for k,v in model_torch.named_parameters():
-      if not CI: print(f"initting {k} from torch")
+      if sys.stdout.isatty(): print(f"initting {k} from torch")
       model_state_dict[k].assign(Tensor(v.detach().numpy())).realize()
 
     optimizer = optim.SGD(get_parameters(model), lr=0.001)
@@ -35,14 +34,14 @@ def compare_tiny_torch(model, model_torch, X, Y):
     loss_torch.backward()
 
     # assert losses match
-    if not CI: print(loss.realize().numpy())
-    if not CI: print(loss_torch.detach().numpy())
+    if sys.stdout.isatty(): print(loss.realize().numpy())
+    if sys.stdout.isatty(): print(loss_torch.detach().numpy())
     np.testing.assert_allclose(loss.realize().numpy(), loss_torch.detach().numpy(), atol=1e-4)
 
     for k,v in list(model_torch.named_parameters())[::-1]:
       g = model_state_dict[k].grad.numpy()
       gt = v.grad.detach().numpy()
-      if not CI: print("testing grads", k, model_state_dict[k].grad.dtype)
+      if sys.stdout.isatty(): print("testing grads", k, model_state_dict[k].grad.dtype)
       np.testing.assert_allclose(g, gt, atol=1e-3, err_msg=f'grad mismatch {k}')
 
     # take the steps
@@ -51,7 +50,7 @@ def compare_tiny_torch(model, model_torch, X, Y):
 
     # assert weights match
     for k,v in model_torch.named_parameters():
-      if not CI: print("testing weight", k, model_state_dict[k].dtype)
+      if sys.stdout.isatty(): print("testing weight", k, model_state_dict[k].dtype)
       np.testing.assert_allclose(model_state_dict[k].numpy(), v.detach().numpy(), atol=1e-3, err_msg=f'weight mismatch {k}')
 
 def get_mnist_data():
diff --git a/test/null/test_device.py b/test/null/test_device.py
index e08b37866e..b31be94173 100644
--- a/test/null/test_device.py
+++ b/test/null/test_device.py
@@ -5,7 +5,6 @@ from tinygrad import Tensor
 from tinygrad.device import Device, Compiler, enumerate_devices_str
 from tinygrad.helpers import diskcache_get, diskcache_put, getenv, Context, Target, WIN, OSX, DEV
 from tinygrad.runtime.support.c import DLL
-from test.helpers import CI
 
 class TestDevice(unittest.TestCase):
   def test_canonicalize(self):
@@ -67,7 +66,7 @@ class TestDevice(unittest.TestCase):
     self.assertNotEqual(result.returncode, 0)
     self.assertIn(b"deprecated", result.stderr)
 
-  @unittest.skipIf(WIN and CI, "skipping windows test") # TODO: subprocess causes memory violation?
+  @unittest.skipIf(WIN, "skipping windows test") # TODO: subprocess causes memory violation?
   def test_env_overwrite_default_compiler(self):
     if Device.DEFAULT == "CPU":
       from tinygrad.runtime.support.compiler_cpu import CPULLVMCompiler, ClangJITCompiler
@@ -95,7 +94,7 @@ class TestDevice(unittest.TestCase):
                         shell=True, check=True, env={**os.environ, "DEV": "AMD:HIP"})
     else: self.skipTest("only run on CPU/AMD")
 
-  @unittest.skipIf(WIN and CI, "skipping windows test")
+  @unittest.skipIf(WIN, "skipping windows test")
   def test_env_online(self):
     from tinygrad.runtime.support.compiler_cpu import CPULLVMCompiler, ClangJITCompiler
     try: _, _ = CPULLVMCompiler(), ClangJITCompiler()
diff --git a/test/null/test_winograd.py b/test/null/test_winograd.py
index 98d137f733..e9ac04b8be 100644
--- a/test/null/test_winograd.py
+++ b/test/null/test_winograd.py
@@ -1,7 +1,6 @@
 import unittest, sys
 from tinygrad import Tensor, GlobalCounters, dtypes, Context
-from tinygrad.helpers import Profiling, WINO
-from test.helpers import CI
+from tinygrad.helpers import WINO
 
 @unittest.skipIf(sys.platform.startswith("win"), "flaky on Windows")
 class TestWinograd(unittest.TestCase):
@@ -11,11 +10,6 @@ class TestWinograd(unittest.TestCase):
   def tearDown(self):
     WINO.value = self.old
 
-  def test_profile(self):
-    x,w = Tensor.rand(1,4,9,9).realize(), Tensor.rand(4,4,3,3).realize()
-    with Profiling(enabled=not CI, sort='time'):
-      Tensor.conv2d(x,w).realize()
-
   def test_forward_kernels(self):
     x,w = Tensor.rand(1,4,9,9).realize(), Tensor.rand(4,4,3,3).realize()
     out = Tensor.conv2d(x,w)
diff --git a/test/speed/external_test_copy_speed.py b/test/speed/external_test_copy_speed.py
index 6ab3b1df50..e34b0fc1fe 100644
--- a/test/speed/external_test_copy_speed.py
+++ b/test/speed/external_test_copy_speed.py
@@ -1,7 +1,6 @@
-import unittest, numpy as np
+import unittest, numpy as np, os
 from tinygrad import Tensor, Device, TinyJit
-from tinygrad.helpers import Timing, OSX, getenv
-from test.helpers import CI
+from tinygrad.helpers import Timing, getenv
 import multiprocessing.shared_memory as shared_memory
 
 N = getenv("NSZ", 256)
@@ -12,7 +11,7 @@ class TestCopySpeed(unittest.TestCase):
   def testCopySHMtoDefault(self):
     s = shared_memory.SharedMemory(name="test_X", create=True, size=N*N*4)
     s.close()
-    if CI and not OSX:
+    if os.path.exists("/dev/shm"):
       t = Tensor.empty(N, N, device="disk:/dev/shm/test_X").realize()
     else:
       t = Tensor.empty(N, N, device="disk:shm:test_X").realize()
@@ -77,11 +76,8 @@ class TestCopySpeed(unittest.TestCase):
         Device[Device.DEFAULT].synchronize()
       np.testing.assert_equal(t.numpy(), x.numpy())
 
-  @unittest.skipIf(CI, "CI doesn't have 6 GPUs")
-  @unittest.skipIf(Device.DEFAULT != "CL", "only test this on CL")
+  @unittest.skipIf(Device.DEFAULT != "CL" or Device[Device.DEFAULT].count() != 6, "only test this on CL, with 6 gpus")
   def testCopyCPUto6GPUs(self):
-    from tinygrad.runtime.ops_cl import CLDevice
-    if len(CLDevice.device_ids) != 6: raise unittest.SkipTest("computer doesn't have 6 GPUs")
     t = Tensor.ones(N, N, device="CPU").contiguous().realize()
     print(f"buffer: {t.nbytes()*1e-9:.2f} GB")
     for _ in range(3):
diff --git a/test/speed/external_test_specific_conv.py b/test/speed/external_test_specific_conv.py
index 6533279cf5..190a1ccee9 100644
--- a/test/speed/external_test_specific_conv.py
+++ b/test/speed/external_test_specific_conv.py
@@ -1,9 +1,9 @@
 import unittest
 from tinygrad import Tensor, Device, dtypes
-from test.helpers import CI
+from tinygrad.helpers import DEV
 # similar to test/external/external_test_gpu_ast.py, but universal
 
-@unittest.skipIf(Device.DEFAULT in {"CUDA", "NV"} and CI, "slow on CUDA CI")
+@unittest.skipIf(Device.DEFAULT in {"CUDA", "NV"} and DEV.interface.startswith("MOCK"), "slow on ocelot")
 class TestSpecific(unittest.TestCase):
   # from openpilot
 
diff --git a/test/speed/external_test_speed_v_torch.py b/test/speed/external_test_speed_v_torch.py
index 1c72ba8900..bd3520c056 100644
--- a/test/speed/external_test_speed_v_torch.py
+++ b/test/speed/external_test_speed_v_torch.py
@@ -9,11 +9,11 @@ import torch
 torch.set_num_threads(1)
 import time
 import numpy as np
+import sys
 np.set_printoptions(linewidth=160)
 from tinygrad import Tensor, Device, GlobalCounters, TinyJit
 from tinygrad.nn import Conv2d
 from tinygrad.helpers import colorize_float, getenv, DEV
-from test.helpers import CI
 
 IN_CHANS = [int(x) for x in getenv("IN_CHANS", "4,16,64").split(",")]
 
@@ -96,7 +96,7 @@ def helper_test_generic(name, f1, f1_args, f2, f2_args):
   desc = "faster" if et_torch > et_tinygrad else "slower"
   flops = save_ops*1e-6
   mem = save_mem*1e-6
-  print(("\r" if not CI else "")+f"{name:42s} {et_torch:7.2f} ms ({flops/et_torch:9.2f} GFLOPS {mem/et_torch:7.2f} GB/s) in torch, {et_tinygrad:7.2f} ms ({flops/et_tinygrad:9.2f} GFLOPS {mem/et_tinygrad:7.2f} GB/s) in tinygrad, {colorize_float(et_tinygrad/et_torch)} {desc} {flops:10.2f} MOPS {mem:8.2f} MB")  # noqa: E501
+  print(("\r" if sys.stdout.isatty() else "")+f"{name:42s} {et_torch:7.2f} ms ({flops/et_torch:9.2f} GFLOPS {mem/et_torch:7.2f} GB/s) in torch, {et_tinygrad:7.2f} ms ({flops/et_tinygrad:9.2f} GFLOPS {mem/et_tinygrad:7.2f} GB/s) in tinygrad, {colorize_float(et_tinygrad/et_torch)} {desc} {flops:10.2f} MOPS {mem:8.2f} MB")  # noqa: E501
   atol, rtol = (1e-2, 1e-2) if torch_dt == torch.float16 else (1e-3, 1e-3)
   np.testing.assert_allclose(val_tinygrad, val_torch, atol=atol, rtol=rtol)
 
diff --git a/test/test_tiny.py b/test/test_tiny.py
index 71e2dfdb96..10ad7428df 100644
--- a/test/test_tiny.py
+++ b/test/test_tiny.py
@@ -2,7 +2,6 @@
 import unittest, random
 from tinygrad import Tensor, Context, Variable, TinyJit, dtypes, Device, nn
 from tinygrad.helpers import getenv
-from test.helpers import CI
 
 class TestTiny(unittest.TestCase):
 
@@ -112,7 +111,7 @@ class TestTiny(unittest.TestCase):
   # *** a model ***
 
   # TODO: this is failing because of how swizzling rewrites the ShapeTracker of the final STORE
-  @unittest.skipIf(CI and Device.DEFAULT == "DSP", "failing because of make things that can't be images not images")
+  @unittest.skipIf(Device.DEFAULT == "DSP", "failing because of make things that can't be images not images")
   def test_mnist(self):
     layers = [
       nn.Conv2d(1, 32, 5), Tensor.relu,
@@ -131,7 +130,7 @@ class TestTiny(unittest.TestCase):
     self.assertEqual(len(probs[0]), 10)
 
   # TODO: this is failing because of how swizzling rewrites the ShapeTracker of the final STORE
-  @unittest.skipIf(CI and Device.DEFAULT == "DSP", "failing because of make things that can't be images not images")
+  @unittest.skipIf(Device.DEFAULT == "DSP", "failing because of make things that can't be images not images")
   def test_mnist_backward(self):
     # NOTE: we don't have the whole model here for speed
     layers = [
diff --git a/test/testextra/test_bench_log.py b/test/testextra/test_bench_log.py
index fede0e028b..975bef2fe1 100644
--- a/test/testextra/test_bench_log.py
+++ b/test/testextra/test_bench_log.py
@@ -2,12 +2,12 @@ import unittest, time
 from unittest.case import skipIf
 
 from extra.bench_log import BenchEvent, InstantBenchEvent, WallTimeEvent, KernelTimeEvent, log_event_instant, _events, clear_events
-from tinygrad.helpers import Context
+from tinygrad.helpers import Context, DEV
 from tinygrad.tensor import Tensor
 from tinygrad.device import Device
-from test.helpers import CI
 
-_SKIP_KERNEL_TIMING = Device.DEFAULT == "WEBGPU"  # WEBGPU kernel timing not supported
+# WEBGPU kernel timing not supported, ocelot CUDA is inaccurate
+_SKIP_KERNEL_TIMING = Device.DEFAULT == "WEBGPU" or (Device.DEFAULT == "CUDA" and DEV.interface.startswith("MOCK"))
 
 class TestBenchLog(unittest.TestCase):
   def setUp(self):
@@ -38,7 +38,7 @@ class TestBenchLog(unittest.TestCase):
       self.assertGreater(_events[event]["wall"][0], 0)
       self.assertGreater(_events[event]["wall"][1], 0)
 
-  @skipIf(CI or _SKIP_KERNEL_TIMING, "ci timing is not accurate")
+  @skipIf(_SKIP_KERNEL_TIMING, "ci timing is not accurate")
   def test_log_single_kernel_time(self):
     wall_times = []
 
@@ -55,7 +55,7 @@ class TestBenchLog(unittest.TestCase):
       self.assertLess(_events[event]["kernel"][0], wall_times[0])
       self.assertGreater(_events[event]["kernel"][0], 0)
 
-  @skipIf((CI and Device.DEFAULT == "CUDA") or _SKIP_KERNEL_TIMING, "ci cuda timing is not accurate")
+  @skipIf(_SKIP_KERNEL_TIMING, "ci cuda timing is not accurate")
   def test_interleaved_wall_kernel_time(self):
     wall_times = []
     with Context(DEBUG=2):
@@ -77,7 +77,7 @@ class TestBenchLog(unittest.TestCase):
       self.assertLess(_events[event]["kernel"][0], wall_times[0])
       self.assertGreater(_events[event]["kernel"][0], 0)
 
-  @skipIf((CI and Device.DEFAULT == "CUDA") or _SKIP_KERNEL_TIMING, "ci cuda timing is not accurate")
+  @skipIf(_SKIP_KERNEL_TIMING, "ci cuda timing is not accurate")
   def test_stacked_wall_kernel_time(self):
     with Context(DEBUG=2):
       for event in BenchEvent:
diff --git a/test/unit/test_assign.py b/test/unit/test_assign.py
index 4aec79b4aa..5e5b1ff16a 100644
--- a/test/unit/test_assign.py
+++ b/test/unit/test_assign.py
@@ -4,7 +4,6 @@ import numpy as np
 from tinygrad import dtypes, Tensor, TinyJit, GlobalCounters, Variable
 from tinygrad.uop.ops import Ops, UOp
 from tinygrad.helpers import temp, DEV, Context
-from test.helpers import CI
 
 N = 200  # has to be bigger than the cache to fail
 
@@ -189,7 +188,7 @@ class TestAssign(unittest.TestCase):
     new = a + times_a
     np.testing.assert_allclose(new.numpy(), 8)
 
-  @unittest.skipIf(CI and DEV.renderer == "LVP", "flaky in CI")
+  @unittest.skipIf(DEV.renderer == "LVP", "flaky in CI")
   def test_double_assign(self):
     a = Tensor.ones(4).contiguous().realize()
     a += 1
diff --git a/test/unit/test_shm_tensor.py b/test/unit/test_shm_tensor.py
index 69a6746e28..19b6c9ff9d 100644
--- a/test/unit/test_shm_tensor.py
+++ b/test/unit/test_shm_tensor.py
@@ -2,11 +2,10 @@ import unittest
 import multiprocessing.shared_memory as shared_memory
 from tinygrad.helpers import WIN
 from tinygrad import Tensor, Device
-from test.helpers import CI
 import numpy as np
 
 class TestRawShmBuffer(unittest.TestCase):
-  @unittest.skipIf(WIN and CI, "only fails on CI windows instance")
+  @unittest.skipIf(WIN, "only fails on CI windows instance")
   def test_e2e(self):
     t = Tensor.randn(2, 2, 2).realize()