mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-13 00:15:35 +08:00
Add CACHE_LAZYBUFFERS option + training benchmark. Now only within a couple x of torch
This commit is contained in:
@@ -18,6 +18,7 @@ REMOVE_MOVEMENT_NOPS = True
|
||||
MERGE_ELEMENTWISE_OPS = True
|
||||
MERGE_ELEMENTWISE_INTO_CONV_OUTPUT = True
|
||||
FOLD_CONSTANTS_INTO_KERNELS = True
|
||||
CACHE_LAZYBUFFERS = False # this leaks tons of memory. TODO: only cache unresolved LazyBuffers
|
||||
|
||||
class LazyOp(NamedTuple):
|
||||
op: Op
|
||||
@@ -84,10 +85,10 @@ class LazyBuffer:
|
||||
def unary_op(x, op): return elementwise_op(op, (x,))
|
||||
def binary_op(x, op, y:LazyBuffer): return elementwise_op(op, (x,y))
|
||||
|
||||
@functools.lru_cache(maxsize=None)
|
||||
def contiguous_op(x): return x if x.st.contiguous else LazyBuffer(x.shape, LoadOps, LazyOp(LoadOps.CONTIGUOUS, (x,)))
|
||||
@functools.lru_cache(maxsize=None if CACHE_LAZYBUFFERS else 0)
|
||||
def contiguous_op(x) -> LazyBuffer: return x if x.st.contiguous else LazyBuffer(x.shape, LoadOps, LazyOp(LoadOps.CONTIGUOUS, (x,)))
|
||||
|
||||
@functools.lru_cache(maxsize=None)
|
||||
@functools.lru_cache(maxsize=None if CACHE_LAZYBUFFERS else 0)
|
||||
def movement_op(x, op:MovementOps, arg) -> LazyBuffer:
|
||||
if SHUFFLE_MOVEMENT_OPS and x.optype == BinaryOps:
|
||||
# if this MovementOp is being applied to a BinaryOp, apply the MovementOp to all the BinaryOp inputs instead
|
||||
@@ -126,7 +127,7 @@ def ast(x: Union[LazyBuffer, LazyOp], lazy_srcs: Dict[LazyBuffer, str]) -> str:
|
||||
return ast_op(x.op, [ast(src, lazy_srcs) for src in x.src])
|
||||
|
||||
# this is needed to reduce convs from 186 -> 174
|
||||
@functools.lru_cache(maxsize=None)
|
||||
@functools.lru_cache(maxsize=None if CACHE_LAZYBUFFERS else 0)
|
||||
def elementwise_op(op, srcs:Tuple[LazyBuffer]) -> LazyBuffer:
|
||||
out_shape = srcs[0].shape
|
||||
|
||||
|
||||
47
examples/benchmark_train_efficientnet.py
Normal file
47
examples/benchmark_train_efficientnet.py
Normal file
@@ -0,0 +1,47 @@
|
||||
#!/usr/bin/env python3
|
||||
import os
|
||||
import time
|
||||
from tqdm import trange
|
||||
from extra.utils import get_parameters
|
||||
from models.efficientnet import EfficientNet
|
||||
import tinygrad.optim as optim
|
||||
from tinygrad.tensor import Tensor
|
||||
|
||||
from test.test_gc import tensors_allocated
|
||||
|
||||
import pynvml
|
||||
# Open an NVML session so GPU memory usage can be sampled during the benchmark.
pynvml.nvmlInit()
# Device index 0 is the GPU whose memory usage is reported after each iteration.
handle = pynvml.nvmlDeviceGetHandleByIndex(0)

# Benchmark knobs, each overridable through an environment variable.
NUM = int(os.getenv("NUM", 2))   # first positional argument passed to EfficientNet
BS = int(os.getenv("BS", 8))     # batch size of the random input/target tensors
CNT = int(os.getenv("CNT", 10))  # number of timed iterations

if __name__ == "__main__":
    print(f"NUM:{NUM} BS:{BS} CNT:{CNT}")
    model = EfficientNet(NUM, classes=1000, has_se=False)
    parameters = get_parameters(model)
    optimizer = optim.Adam(parameters, lr=0.001)

    Tensor.training = True
    try:
        for i in trange(CNT):
            # Fresh random data every iteration; only timing and memory are of interest.
            x_train = Tensor.randn(BS, 3, 224, 224, requires_grad=False)
            y_train = Tensor.randn(BS, 1000, requires_grad=False)

            # Timed region: forward pass, loss construction, zero_grad and backward.
            # NOTE(review): if the backend evaluates lazily, this interval may not
            # include the actual kernel execution, which would then happen at
            # loss.cpu() below -- confirm what is being measured.
            st = time.monotonic()
            out = model.forward(x_train)
            loss = out.logsoftmax().mul(y_train).mean()
            optimizer.zero_grad()
            loss.backward()
            et = time.monotonic()

            # Pull the scalar loss back to the host for printing.
            loss = loss.cpu().data[0]
            info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            print(f"{(et-st)*1000.0:7.2f} ms, {loss:7.2f} loss, {tensors_allocated():4d} tensors, {info.used/1e9:.2f} GB used")
    finally:
        # Pair nvmlInit() with nvmlShutdown() so the NVML session is released
        # even when an iteration raises.
        pynvml.nvmlShutdown()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user