# Patch summary (explanatory preamble; everything before the first "diff --git"
# header is leading text that patch tools skip).
#
# accel/lazy/ops_lazy.py
#   * Adds the module-level flag CACHE_LAZYBUFFERS, set to False.
#   * The functools.lru_cache decorators on contiguous_op, movement_op and
#     elementwise_op change from maxsize=None to
#     "maxsize=None if CACHE_LAZYBUFFERS else 0".  The expression is evaluated
#     once, when each decorator runs at definition time, so changing the flag
#     afterwards does not alter the already-built caches.  In CPython an
#     lru_cache with maxsize=0 stores no entries, so with the flag False these
#     functions are effectively uncached.
#   * contiguous_op gains a "-> LazyBuffer" return annotation.
#
# examples/benchmark_train_efficientnet.py (new file)
#   * Reads NUM, BS and CNT from the environment (defaults 2, 8, 10).
#   * Builds EfficientNet(NUM, classes=1000, has_se=False) and an Adam
#     optimizer (lr=0.001) over get_parameters(model).
#   * For each of CNT iterations: creates random x_train / y_train tensors,
#     times forward + loss + zero_grad + backward with time.monotonic(), then
#     prints elapsed ms, the loss value, tensors_allocated() and the NVML
#     "used" memory of device index 0 in GB.
#   * NOTE(review): optimizer.step() is never called, so parameters are not
#     updated between iterations -- confirm this is intended for the benchmark.
#   * NOTE(review): pynvml.nvmlInit() runs at import time and no
#     nvmlShutdown() appears in this patch -- confirm that is acceptable.
#
# NOTE(review): the line breaks below follow the hunk headers' line counts;
# the leading indentation of Python lines was not recoverable from the
# collapsed text and is assumed to be 2 spaces per level -- verify against the
# target files before applying.
diff --git a/accel/lazy/ops_lazy.py b/accel/lazy/ops_lazy.py
index 683d15dab9..145023c2b4 100644
--- a/accel/lazy/ops_lazy.py
+++ b/accel/lazy/ops_lazy.py
@@ -18,6 +18,7 @@ REMOVE_MOVEMENT_NOPS = True
 MERGE_ELEMENTWISE_OPS = True
 MERGE_ELEMENTWISE_INTO_CONV_OUTPUT = True
 FOLD_CONSTANTS_INTO_KERNELS = True
+CACHE_LAZYBUFFERS = False # this leaks tons of memory. TODO: only cache unresolved LazyBuffers
 
 class LazyOp(NamedTuple):
   op: Op
@@ -84,10 +85,10 @@ class LazyBuffer:
   def unary_op(x, op): return elementwise_op(op, (x,))
   def binary_op(x, op, y:LazyBuffer): return elementwise_op(op, (x,y))
 
-  @functools.lru_cache(maxsize=None)
-  def contiguous_op(x): return x if x.st.contiguous else LazyBuffer(x.shape, LoadOps, LazyOp(LoadOps.CONTIGUOUS, (x,)))
+  @functools.lru_cache(maxsize=None if CACHE_LAZYBUFFERS else 0)
+  def contiguous_op(x) -> LazyBuffer: return x if x.st.contiguous else LazyBuffer(x.shape, LoadOps, LazyOp(LoadOps.CONTIGUOUS, (x,)))
 
-  @functools.lru_cache(maxsize=None)
+  @functools.lru_cache(maxsize=None if CACHE_LAZYBUFFERS else 0)
   def movement_op(x, op:MovementOps, arg) -> LazyBuffer:
     if SHUFFLE_MOVEMENT_OPS and x.optype == BinaryOps:
       # if this MovementOp is being applied to a BinaryOp, apply the MovementOp to all the BinaryOp inputs instead
@@ -126,7 +127,7 @@ def ast(x: Union[LazyBuffer, LazyOp], lazy_srcs: Dict[LazyBuffer, str]) -> str:
   return ast_op(x.op, [ast(src, lazy_srcs) for src in x.src])
 
 # this is needed to reduce convs from 186 -> 174
-@functools.lru_cache(maxsize=None)
+@functools.lru_cache(maxsize=None if CACHE_LAZYBUFFERS else 0)
 def elementwise_op(op, srcs:Tuple[LazyBuffer]) -> LazyBuffer:
   out_shape = srcs[0].shape
 
diff --git a/examples/benchmark_train_efficientnet.py b/examples/benchmark_train_efficientnet.py
new file mode 100644
index 0000000000..e5acf78da1
--- /dev/null
+++ b/examples/benchmark_train_efficientnet.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+import os
+import time
+from tqdm import trange
+from extra.utils import get_parameters
+from models.efficientnet import EfficientNet
+import tinygrad.optim as optim
+from tinygrad.tensor import Tensor
+
+from test.test_gc import tensors_allocated
+
+import pynvml
+pynvml.nvmlInit()
+handle = pynvml.nvmlDeviceGetHandleByIndex(0)
+
+NUM = int(os.getenv("NUM", 2))
+BS = int(os.getenv("BS", 8))
+CNT = int(os.getenv("CNT", 10))
+
+if __name__ == "__main__":
+  print(f"NUM:{NUM} BS:{BS} CNT:{CNT}")
+  model = EfficientNet(NUM, classes=1000, has_se=False)
+  parameters = get_parameters(model)
+  optimizer = optim.Adam(parameters, lr=0.001)
+
+  Tensor.training = True
+  for i in trange(CNT):
+    x_train = Tensor.randn(BS, 3, 224, 224, requires_grad=False)
+    y_train = Tensor.randn(BS, 1000, requires_grad=False)
+
+    st = time.monotonic()
+    out = model.forward(x_train)
+    loss = out.logsoftmax().mul(y_train).mean()
+    optimizer.zero_grad()
+    loss.backward()
+    et = time.monotonic()
+
+    loss = loss.cpu().data[0]
+    info = pynvml.nvmlDeviceGetMemoryInfo(handle)
+    print(f"{(et-st)*1000.0:7.2f} ms, {loss:7.2f} loss, {tensors_allocated():4d} tensors, {info.used/1e9:.2f} GB used")
+
+
+
+
+
+
+