diff --git a/extra/bandwidth_test.py b/extra/bandwidth_test.py
new file mode 100644
index 0000000000..7e32ad8d75
--- /dev/null
+++ b/extra/bandwidth_test.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+"""Print a GPUS x GPUS table of measured bandwidth (GB/s) from each source device to each destination device."""
+from tinygrad import Tensor, Device, GlobalCounters, Context, dtypes
+from tinygrad.helpers import getenv, colored
+
+SZ = 8_000_000_000  # bytes per measurement: each tensor holds SZ uint8 elements
+GPUS = getenv("GPUS", 4) # TODO: expose a way in tinygrad to access this
+
+if __name__ == "__main__":
+  devices = [f"{Device.DEFAULT}:{i}" for i in range(GPUS)]
+
+  # one SZ-byte source tensor per device, realized before any measurement starts
+  tens = [Tensor.ones(SZ, dtype=dtypes.uint8, device=d).contiguous() for d in devices]
+  Tensor.realize(*tens)
+
+  # bw[i][j] is the figure measured with device i as the source and device j as the destination
+  bw = [[0.0]*GPUS for _ in range(GPUS)]
+  for i in range(GPUS):
+    for j in range(GPUS):
+      GlobalCounters.reset()
+      # NOTE(review): DEBUG=2 is presumably what makes tinygrad accumulate time_sum_s -- confirm
+      with Context(DEBUG=2):
+        if i == j:
+          # a same-device copy would be optimized out, so time an elementwise +1 on device i instead
+          (tens[i]+1).realize()
+        else:
+          tens[i].to(devices[j]).realize()
+      # clamp so a reported time of zero cannot cause a division by zero
+      t = max(GlobalCounters.time_sum_s, 1e-9)
+      bw[i][j] = SZ / t / 1e9  # GB/s
+
+  def fmt(x):
+    """Format x as a 6-wide, 1-decimal number colored green above 50, yellow above 20, red otherwise."""
+    c = "green" if x > 50 else "yellow" if x > 20 else "red"
+    return colored(f"{x:6.1f}", c)
+
+  arrow = " -> "
+  # header: indent by the full row-prefix width (6-wide label plus the arrow) so headings sit over their columns
+  print(" " * (6 + len(arrow)) + " ".join(f"{'d'+str(j):>6}" for j in range(GPUS)))
+  # rows
+  for i, row in enumerate(bw):
+    print(f"{'s'+str(i):>6}{arrow}" + " ".join(fmt(x) for x in row))
diff --git a/extra/gemm/torch_gemm.py b/extra/gemm/torch_gemm.py
index aee1a8e6aa..da7d2bcd0e 100644
--- a/extra/gemm/torch_gemm.py
+++ b/extra/gemm/torch_gemm.py
@@ -12,7 +12,7 @@ MPS = getenv("MPS", 0)
 if getenv("FP16_ACC"): torch.backends.cuda.matmul.allow_fp16_accumulation = True
 
 for dtype in [torch.float32, torch.float16, torch.bfloat16]:
-  for N in [256, 512, 1024, 2048, 4096]:
+  for N in [256, 512, 1024, 2048, 4096] + ([6144, 8192] if getenv("BIG") else []):
     FLOPS = N*N*N*2
 
     b = torch.rand((N,N), dtype=dtype)
diff --git a/extra/gpuburn.py b/extra/gpuburn.py
new file mode 100644
index 0000000000..2066773554
--- /dev/null
+++ b/extra/gpuburn.py
@@ -0,0 +1,18 @@
+from tinygrad import Tensor, Device, TinyJit, dtypes
+from tinygrad.helpers import getenv
+
+GPUS = getenv("GPUS", 4) # TODO: expose a way in tinygrad to access this
+N = 6144
+
+@TinyJit
+def many_matmul(A, B):
+  """Right-multiply A by B eight times in a row and return the final product."""
+  for _ in range(8): A = A@B
+  return A
+
+if __name__ == "__main__":
+  # both operands are (GPUS, N, N) half-precision tensors of ones, sharded along axis 0 across the devices
+  devs = tuple(f"{Device.DEFAULT}:{i}" for i in range(GPUS))
+  A, B = (Tensor.ones(GPUS, N, N, dtype=dtypes.half).shard(devices=devs, axis=0).contiguous() for _ in range(2))
+  # repeat the jitted matmul chain forever; the script only stops when it is killed
+  while True: many_matmul(A, B)
diff --git a/extra/mmapeak/mmapeak.py b/extra/mmapeak/mmapeak.py
index 36c2cf842c..36e24fd372 100644
--- a/extra/mmapeak/mmapeak.py
+++ b/extra/mmapeak/mmapeak.py
@@ -32,7 +32,7 @@ def launchBenchmark(instruction, vgprIndices, dense=True, accum=False, extra="")
   src = src.replace("DIRECTIVE", DIRECTIVE)
   lib = COMPILER.compile(src)
   fxn = AMDProgram(DEV, "matmul", lib)
-  elapsed = fxn(global_size=(NUM_WORKGROUPS,1,1), local_size=(WAVE_SIZE*NUM_WAVES,1,1), wait=True)
+  elapsed = min([fxn(global_size=(NUM_WORKGROUPS,1,1), local_size=(WAVE_SIZE*NUM_WAVES,1,1), wait=True) for _ in range(2)])
   FLOPs = FLOPS_PER_MATMUL * NUM_WAVES * NUM_WORKGROUPS * INTERNAL_LOOP * INSTRUCTIONS_PER_LOOP
   print(f"{instruction:<29} : {FLOPs/elapsed/10**12:.2f} T(FL)OPS")
 
diff --git a/extra/mmapeak/template.s b/extra/mmapeak/template.s
index ecf0704e2b..b84aba74f3 100644
--- a/extra/mmapeak/template.s
+++ b/extra/mmapeak/template.s
@@ -3,14 +3,14 @@
     .p2align 8
     .type matmul,@function
 matmul:
-    s_mov_b32 s1, INTERNAL_LOOP
-    s_mov_b32 s2, 0
-    inner_loop:
-        INSTRUCTION
-        s_sub_u32 s1, s1, 1
-        s_cmp_lg_i32 s1, s2
-        s_cbranch_scc1 inner_loop
-    s_endpgm
+  s_mov_b32 s1, INTERNAL_LOOP  // s1 = number of loop iterations still to run
+  s_mov_b32 s2, 0  // s2 = 0, the value s1 is compared against below
+  inner_loop:
+    INSTRUCTION
+    s_sub_u32 s1, s1, 1  // one iteration finished
+    s_cmp_lg_i32 s1, s2  // scc = (s1 != s2)
+    s_cbranch_scc1 inner_loop  // loop again while s1 has not reached s2
+  s_endpgm
 
 .rodata
 .p2align 6