diff --git a/accel/lazy/ops_lazy.py b/accel/lazy/ops_lazy.py
index 683d15dab9..145023c2b4 100644
--- a/accel/lazy/ops_lazy.py
+++ b/accel/lazy/ops_lazy.py
@@ -18,6 +18,7 @@ REMOVE_MOVEMENT_NOPS = True
 MERGE_ELEMENTWISE_OPS = True
 MERGE_ELEMENTWISE_INTO_CONV_OUTPUT = True
 FOLD_CONSTANTS_INTO_KERNELS = True
+CACHE_LAZYBUFFERS = False    # disabled: the unbounded lru_cache it enables leaks tons of memory. TODO: only cache unresolved LazyBuffers
 
 class LazyOp(NamedTuple):
   op: Op
@@ -84,10 +85,10 @@ class LazyBuffer:
   def unary_op(x, op): return elementwise_op(op, (x,))
   def binary_op(x, op, y:LazyBuffer): return elementwise_op(op, (x,y))
 
-  @functools.lru_cache(maxsize=None)
-  def contiguous_op(x): return x if x.st.contiguous else LazyBuffer(x.shape, LoadOps, LazyOp(LoadOps.CONTIGUOUS, (x,)))
+  @functools.lru_cache(maxsize=None if CACHE_LAZYBUFFERS else 0)  # unbounded cache when the flag is set, maxsize=0 otherwise
+  def contiguous_op(x) -> LazyBuffer: return x if x.st.contiguous else LazyBuffer(x.shape, LoadOps, LazyOp(LoadOps.CONTIGUOUS, (x,)))
 
-  @functools.lru_cache(maxsize=None)
+  @functools.lru_cache(maxsize=None if CACHE_LAZYBUFFERS else 0)
   def movement_op(x, op:MovementOps, arg) -> LazyBuffer:
     if SHUFFLE_MOVEMENT_OPS and x.optype == BinaryOps:
       # if this MovementOp is being applied to a BinaryOp, apply the MovementOp to all the BinaryOp inputs instead
@@ -126,7 +127,7 @@ def ast(x: Union[LazyBuffer, LazyOp], lazy_srcs: Dict[LazyBuffer, str]) -> str:
   return ast_op(x.op, [ast(src, lazy_srcs) for src in x.src])
 
 # this is needed to reduce convs from 186 -> 174
-@functools.lru_cache(maxsize=None)
+@functools.lru_cache(maxsize=None if CACHE_LAZYBUFFERS else 0)
 def elementwise_op(op, srcs:Tuple[LazyBuffer]) -> LazyBuffer:
   out_shape = srcs[0].shape
 
diff --git a/examples/benchmark_train_efficientnet.py b/examples/benchmark_train_efficientnet.py
new file mode 100644
index 0000000000..e5acf78da1
--- /dev/null
+++ b/examples/benchmark_train_efficientnet.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+import os
+import time
+from tqdm import trange
+from extra.utils import get_parameters
+from models.efficientnet import EfficientNet
+import tinygrad.optim as optim
+from tinygrad.tensor import Tensor
+
+from test.test_gc import tensors_allocated
+
+import pynvml
+pynvml.nvmlInit()  # runs at import time, before the device handle lookup below
+handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # device index 0; used for the per-iteration memory query
+
+NUM = int(os.getenv("NUM", 2))  # env override; passed as the first argument to EfficientNet(...)
+BS = int(os.getenv("BS", 8))  # env override; leading dimension of the random input/target tensors
+CNT = int(os.getenv("CNT", 10))  # env override; number of loop iterations
+
+if __name__ == "__main__":  # benchmark: times forward + backward on random data and reports memory use
+  print(f"NUM:{NUM} BS:{BS} CNT:{CNT}")
+  model = EfficientNet(NUM, classes=1000, has_se=False)
+  parameters = get_parameters(model)
+  optimizer = optim.Adam(parameters, lr=0.001)
+
+  Tensor.training = True
+  for i in trange(CNT):
+    x_train = Tensor.randn(BS, 3, 224, 224, requires_grad=False)  # fresh random input batch every iteration
+    y_train = Tensor.randn(BS, 1000, requires_grad=False)  # random targets; width matches classes=1000
+
+    st = time.monotonic()  # timed region covers forward, loss, zero_grad and backward only
+    out = model.forward(x_train)
+    loss = out.logsoftmax().mul(y_train).mean()
+    optimizer.zero_grad()  # NOTE(review): optimizer.step() is never called in this loop — confirm that is intended
+    loss.backward()
+    et = time.monotonic()
+
+    loss = loss.cpu().data[0]  # rebinds loss to a scalar for printing; NOTE(review): outside the timed region, so any work deferred until here is not measured — confirm
+    info = pynvml.nvmlDeviceGetMemoryInfo(handle)  # info.used is reported below in GB (divided by 1e9)
+    print(f"{(et-st)*1000.0:7.2f} ms, {loss:7.2f} loss, {tensors_allocated():4d} tensors, {info.used/1e9:.2f} GB used")
+
+
+
+
+
+
+