diff --git a/examples/hlb_cifar10.py b/examples/hlb_cifar10.py
index 6a5dab21ed..7f2fc06a77 100644
--- a/examples/hlb_cifar10.py
+++ b/examples/hlb_cifar10.py
@@ -101,11 +101,11 @@ def train_cifar():
   # 4.7 seconds for float32 w/o channels last. 24 TFLOPS. we get 50ms then i'll be happy. only 64x off
 
   # https://www.anandtech.com/show/16727/nvidia-announces-geforce-rtx-3080-ti-3070-ti-upgraded-cards-coming-in-june
-  # 136 TFLOPS is the theoretical max w float16 on 3080TI
+  # 136 TFLOPS is the theoretical max with float16 on the 3080 Ti
 
+  BS = int(os.getenv("BS", "512"))
   for i in range(10):
-    # TODO: the real batch size is 512
-    X, Y = fetch_batch(X_train, Y_train, BS=5)
+    X, Y = fetch_batch(X_train, Y_train, BS=BS)
     CL.time_sum, CL.kernel_count = 0, -1
     CL.ops_sum = 0  # TODO: this should be GlobalCounters.global_ops
     st = time.monotonic()
diff --git a/extra/kernel_search.py b/extra/kernel_search.py
index 433ae08e85..bcbe57b6d9 100644
--- a/extra/kernel_search.py
+++ b/extra/kernel_search.py
@@ -17,8 +17,8 @@ intervention_cache = dbm.open('/tmp/kopt.db', 'c')
 Interventions = Enum("Interventions", ["SWAP", "UPCAST", "SHIFT", "REDUCE"])
 def get_interventions(k):
   p1, p2, p3, p4, p5 = [], [], [], [], []
-  #p1 = [(Interventions.SWAP, x) for x in itertools.combinations(range(k.first_reduce), 2)]
-  #p2 = [(Interventions.SWAP, x) for x in itertools.combinations(range(k.first_reduce, k.shape_len), 2)]
+  p1 = [(Interventions.SWAP, x) for x in itertools.combinations(range(k.first_reduce), 2)]
+  p2 = [(Interventions.SWAP, x) for x in itertools.combinations(range(k.first_reduce, k.shape_len), 2)]
   p3 = [(Interventions.UPCAST, None)] if max(st.shape[-1] for st in k.sts) < 32 else []
   for up_axis in range(k.shape_len):
     max_up = max(st.shape[up_axis] for st in k.sts)
@@ -80,27 +80,27 @@ def run_and_time(k):
     ret.append(t4-t1)
   return min(ret)
 
-def search_one(ast, winning_interventions=[]):
+def search_one(ast, winning_interventions=[], debug=False):
   k = CLASTKernel(ast)
   for w in winning_interventions: apply_intervention(k, *w)
   ints = get_interventions(k)
   options = [(run_and_time(k), None, 0.9)]
   name = k.fxn.name
-  #print(f"{options[-1][1]} : {options[-1][0]*1e-3:.2f}")
+  if debug: print(f"{options[-1][1]} : {options[-1][0]*1e-3:.2f}")
   for int in ints:
     try:
       k = CLASTKernel(ast)
       for w in winning_interventions: apply_intervention(k, *w)
       apply_intervention(k, *int)
       options.append((run_and_time(k), int, 1.0))
-      #print(f"{options[-1][1]} : {options[-1][0]*1e-3:.2f}")
+      if debug: print(f"{options[-1][1]} : {options[-1][0]*1e-3:.2f}")
     except Exception:
-      #print(int, "FAILED")
+      if debug: print(int, "FAILED")
       pass
   baseline = options[0]
   options = sorted(options, key=lambda x: x[0]*x[2])
   best = options[0]
-  print(f"{name:30s} {baseline[0]/1e3:8.2f} us -> {best[0]/1e3:8.2f} us {baseline[0]/best[0]:7.2f}x *with* {winning_interventions} + {best[1]}")
+  print(f"{name:30s} {baseline[0]/1e3:9.2f} us -> {best[0]/1e3:9.2f} us {baseline[0]/best[0]:7.2f}x *with* {winning_interventions} + {best[1]}")
   return best
 
 def apply_optimization(k, ast, max_interventions=1, cache=True):
@@ -123,7 +123,7 @@ def search(ast):
   winning_interventions = []
   for i in range(10):
     print(winning_interventions)
-    oo = search_one(ast, winning_interventions)
+    oo = search_one(ast, winning_interventions, True)
     print(oo)
     if oo[1] is None: break
     winning_interventions.append(oo[1])
@@ -209,6 +209,12 @@ if __name__ == "__main__":
     buf1 = GPUBuffer(shape=ShapeTracker(shape=(32, 1, 1, 1), views=[View((32, 1, 1, 1), (0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([9.964923e-06], dtype=np.float32)))
     op1 = LazyOp(BinaryOps.MUL, (op0,buf1,), None)
     ast = LazyOp(MovementOps.RESHAPE, (op1,), (1, 32, 1, 1))
+  elif int(os.getenv("CONVW", "0")):
+    buf0 = GPUBuffer(shape=ShapeTracker(shape=(64, 1, 128, 3, 3, 512, 32, 32), views=[View((64, 512, 34, 34), (1024, 65536, 32, 1), -33), ZeroView((64, 512, 32, 32), ((0, 64), (0, 512), (-1, 33), (-1, 33))), View((64, 1, 128, 3, 3, 512, 32, 32), (591872, 591872, 0, 34, 1, 1156, 34, 1), 0)]), hostbuf=GPUBuffer(shape=(512, 64, 32, 32), force_create=True))
+    buf1 = GPUBuffer(shape=ShapeTracker(shape=(64, 1, 128, 3, 3, 512, 32, 32), views=[View((64, 1, 128, 3, 3, 512, 32, 32), (0, 0, 1024, 0, 0, 131072, 32, 1), 0)]), hostbuf=GPUBuffer(shape=(512, 128, 32, 32), force_create=True))
+    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
+    op1 = LazyOp(ReduceOps.SUM, (op0,), (64, 1, 128, 3, 3, 1, 1, 1))
+    ast = LazyOp(MovementOps.RESHAPE, (op1,), (64, 128, 3, 3))
   elif int(os.getenv("BC", "0")):
     # big conv
     buf0 = GPUBuffer(shape=ShapeTracker(shape=(8, 1, 32, 112, 112, 3, 3, 3), views=[View((8, 3, 225, 225), (150528, 50176, 224, 1), 0), ZeroView((8, 3, 224, 224), ((0, 8), (0, 3), (0, 225), (0, 225))), View((8, 1, 32, 112, 112, 3, 3, 3), (151875, 151875, 0, 450, 2, 50625, 225, 1), 0)]), hostbuf=GPUBuffer(shape=(8, 3, 224, 224), force_create=True))