not worth the speed (#1584)

* not worth the speed

* no slots

* uops comments

* bump to python 3.11 for speed

* add critical slots back
This commit is contained in:
George Hotz
2023-08-20 10:24:58 -07:00
committed by GitHub
parent 739f327d2d
commit 012ee7d162
8 changed files with 24 additions and 180 deletions

View File

@@ -14,10 +14,10 @@ jobs:
steps:
- name: Checkout Code
uses: actions/checkout@v3
- name: Set up Python 3.8
- name: Set up Python 3.11
uses: actions/setup-python@v4
with:
python-version: 3.8
python-version: 3.11
- name: Cache pip
uses: actions/cache@v3
with:
@@ -48,10 +48,10 @@ jobs:
steps:
- name: Checkout Code
uses: actions/checkout@v3
- name: Set up Python 3.8
- name: Set up Python 3.11
uses: actions/setup-python@v4
with:
python-version: 3.8
python-version: 3.11
- name: Cache pip
uses: actions/cache@v3
with:
@@ -84,10 +84,10 @@ jobs:
steps:
- name: Checkout Code
uses: actions/checkout@v3
- name: Set up Python 3.8
- name: Set up Python 3.11
uses: actions/setup-python@v4
with:
python-version: 3.8
python-version: 3.11
- name: Cache pip
uses: actions/cache@v3
with:
@@ -121,10 +121,10 @@ jobs:
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
sudo apt update
sudo apt install -y --no-install-recommends intel-oneapi-runtime-compilers intel-oneapi-runtime-opencl
- name: Set up Python 3.8
- name: Set up Python 3.11
uses: actions/setup-python@v4
with:
python-version: 3.8
python-version: 3.11
- name: Cache pip
uses: actions/cache@v3
with:
@@ -209,10 +209,10 @@ jobs:
steps:
- name: Checkout Code
uses: actions/checkout@v3
- name: Set up Python 3.8
- name: Set up Python 3.11
uses: actions/setup-python@v4
with:
python-version: 3.8
python-version: 3.11
- name: Cache pip
uses: actions/cache@v3
with:
@@ -279,10 +279,10 @@ jobs:
steps:
- name: Checkout Code
uses: actions/checkout@v3
- name: Set up Python 3.8
- name: Set up Python 3.11
uses: actions/setup-python@v4
with:
python-version: 3.8
python-version: 3.11
- name: Cache pip
uses: actions/cache@v3
with:

View File

@@ -1,103 +0,0 @@
from tinygrad.helpers import LightWeakSet, LightWeakValueDictionary
import unittest
import time
CNT = 1000
cnt = 0
class MyObject:
def __init__(self):
global cnt
self.cnt = cnt
cnt += 1
#print(f"object {self.cnt} created")
#def __del__(self): print(f"object {self.cnt} destroyed")
class TestWeak(unittest.TestCase):
def test_set_drops(self):
ss = LightWeakSet()
ss.add(MyObject())
assert len(ss) == 0
def test_set_holds(self):
ss = LightWeakSet()
obj = MyObject()
ss.add(obj)
assert len(ss) == 1
def test_set_late_drops(self):
ss = LightWeakSet()
obj = MyObject()
ss.add(obj)
assert len(ss) == 1
del obj
assert len(ss) == 0
def test_dict_drops(self):
dd = LightWeakValueDictionary()
dd[0] = MyObject()
assert 0 not in dd
def test_dict_holds(self):
dd = LightWeakValueDictionary()
dd[0] = ret = MyObject()
assert 0 in dd
def test_a_myobj_microbench(self):
for _ in range(3):
st = time.perf_counter_ns()
for _ in range(CNT):
obj = MyObject()
et = (time.perf_counter_ns() - st)/CNT
print(f"{et:.2f} ns to create MyObject")
def test_set_add_microbench(self):
for _ in range(3):
ss = LightWeakSet()
st = time.perf_counter_ns()
for _ in range(CNT):
obj = MyObject()
ss.add(obj)
assert len(ss) == 1
et = (time.perf_counter_ns() - st)/CNT
print(f"{et:.2f} ns to add to LightWeakSet")
def test_set_del_microbench(self):
for _ in range(3):
ss = LightWeakSet()
st = time.perf_counter_ns()
for _ in range(CNT):
obj = MyObject()
ss.add(obj)
ss.discard(obj)
assert len(ss) == 0
et = (time.perf_counter_ns() - st)/CNT
print(f"{et:.2f} ns to add/del from LightWeakSet")
def test_dict_add_microbench(self):
for _ in range(3):
dd = LightWeakValueDictionary()
st = time.perf_counter_ns()
for i in range(CNT):
obj = MyObject()
dd[i] = obj
assert len(dd) == 1
et = (time.perf_counter_ns() - st)/CNT
print(f"{et:.2f} ns to add to LightWeakDict")
def test_dict_check_microbench(self):
for _ in range(3):
dd = LightWeakValueDictionary()
st = time.perf_counter_ns()
for i in range(CNT):
obj = MyObject()
dd[i] = obj
assert i in dd
tst = dd[i]
del obj,tst
assert len(dd) == 0
et = (time.perf_counter_ns() - st)/CNT
print(f"{et:.2f} ns to add/del from LightWeakDict")
if __name__ == '__main__':
unittest.main()

View File

@@ -184,8 +184,6 @@ def uops_to_asmstyle(lang, function_name:str, uops:List[UOp]):
elif uop == UOps.STORE:
idx, treg, off = lang.addr_w_offset(args)
lang.ins.append(AssemblyInstruction(UOps.STORE, None, [idx, lang.tor[vin[0]]] + ([treg] if treg is not None else []), (off, 'global' if not args.local else 'shared', args.memory_dtype if args.memory_dtype != dtypes.float else None)))
# define registers
lang.ins = [AssemblyInstruction(UOps.DEFINE_REGISTER, None, [], (dtype, lang.type_to_letter(dtype), c)) for dtype,c in lang.cnts.items()] + lang.ins
if DEBUG >= 4:
for tins in lang.ins: print(tins)

View File

@@ -13,8 +13,14 @@ from tinygrad.shape.symbolic import Variable, NumNode, Node, SumNode, MulNode, s
VariableOrNum = Union[Variable, NumNode, Node]
# bottom ones are asm only
class UOps(Enum): LOOP = auto(); DEFINE_LOCAL = auto(); DEFINE_GLOBAL = auto(); LOAD = auto(); ALU = auto(); ENDLOOP = auto(); STORE = auto(); CAST = auto(); BARRIER = auto(); WMMA = auto(); \
SPECIAL = auto(); DEFINE_REGISTER = auto(); LABEL = auto(); COND_BRANCH = auto() # noqa: E702
class UOps(Enum):
LOOP = auto(); ENDLOOP = auto() # loops can be global, local, or other # noqa: E702
DEFINE_GLOBAL = auto(); DEFINE_LOCAL = auto() # this defines buffers # noqa: E702
LOAD = auto(); STORE = auto(); BARRIER = auto() # noqa: E702
ALU = auto(); WMMA = auto(); CAST = auto() # noqa: E702
# TODO: add CONST. use ALU WHERE for gated load
# *** assembly only UOps ***
SPECIAL = auto(); LABEL = auto(); COND_BRANCH = auto() # TODO: replace these with LOOP and ENDLOOP # noqa: E702
def to_image_idx(base_shape:Tuple[int, ...], idxy:Node, valid:Node, validhacks=False) -> Tuple[Node, Node]:
idy = (idxy//(4*base_shape[1]))

View File

@@ -1,7 +1,5 @@
from __future__ import annotations
import os, functools, platform, time, re, contextlib
from weakref import KeyedRef, ref
from _weakref import _remove_dead_weakref # type: ignore
import numpy as np
from typing import Dict, Tuple, Union, List, NamedTuple, Final, Iterator, ClassVar, Optional, Callable, Any, Iterable
from math import prod # noqa: F401 # pylint:disable=unused-import
@@ -42,7 +40,6 @@ class Context(contextlib.ContextDecorator):
class ContextVar:
_cache: ClassVar[Dict[str, ContextVar]] = {}
__slots__ = "value"
value: int
def __new__(cls, key, default_value):
if key in ContextVar._cache: return ContextVar._cache[key]
@@ -134,37 +131,3 @@ class GlobalCounters:
cache: ClassVar[Optional[List[Tuple[Callable, Any, Dict[Any, int]]]]] = None # List[Tuple[Callable, List[RawBuffer], Dict[Variable, int]]]
@staticmethod
def reset(): GlobalCounters.global_ops, GlobalCounters.global_mem, GlobalCounters.time_sum_s, GlobalCounters.kernel_count, GlobalCounters.cache = 0,0,0.0,0,None
# Stripped down version of a WeakSet
class LightWeakSet:
__slots__ = 'data', '_remove', '__weakref__'
def __init__(self):
self.data = set()
def _remove(item, selfref=ref(self)):
self = selfref()
if self: self.data.discard(item)
self._remove = _remove
def __len__(self): return len(self.data)
def add(self, item): self.data.add(ref(item, self._remove))
def discard(self, item): self.data.discard(ref(item))
# Stripped down version of a WeakValueDictionary
class LightWeakValueDictionary:
__slots__ = 'data', '_remove', '__weakref__'
def __init__(self):
def remove(wr, selfref=ref(self), _atomic_removal=_remove_dead_weakref):
self = selfref()
if self: _atomic_removal(self.data, wr.key)
self._remove = remove
self.data = {}
def __getitem__(self, key):
o = self.data[key]()
if o is None: raise KeyError(key)
else: return o
def __len__(self): return len(self.data)
def __delitem__(self, key): del self.data[key]
def __setitem__(self, key, value): self.data[key] = KeyedRef(value, self._remove, key)
def __contains__(self, key): return key in self.data

View File

@@ -2,10 +2,10 @@ from __future__ import annotations
import operator, math
from typing import Callable, Optional, Tuple, Union, List, Dict, Any, cast
import sys, importlib, inspect, functools, pathlib
from weakref import ref
from weakref import ref, WeakSet, WeakValueDictionary
import numpy as np
from tinygrad.helpers import GRAPH, DEBUG, prod, getenv, DType, dtypes, flatten, ImageDType, LightWeakSet, LightWeakValueDictionary
from tinygrad.helpers import GRAPH, DEBUG, prod, getenv, DType, dtypes, flatten, ImageDType
from tinygrad.runtime.ops_cpu import RawNumpyBuffer
from tinygrad.runtime.ops_disk import RawDiskBuffer
from tinygrad.shape.shapetracker import MovementOps, ShapeTracker, View, get_contraction
@@ -94,7 +94,7 @@ def get_single_root(root:LazyBuffer) -> LazyBuffer: return get_single_root(cast(
def get_movementroot(root:LazyBuffer, allow_contiguous=False) -> LazyBuffer: return get_movementroot(cast(LazyBuffer, root.op.src[0]), allow_contiguous) if not root.realized and (root.optype == MovementOps or (root.op.op == LoadOps.CONTIGUOUS and allow_contiguous and root.op.src[0].st.contiguous)) else root
def get_movementroot_contiguous(x:LazyBuffer) -> LazyBuffer: return get_movementroot_contiguous(cast(LazyBuffer, x.op.src[0])) if not x.realized and x.op.op == LoadOps.CONTIGUOUS else (get_movementroot(x, True) if x.optype == MovementOps and x.st.contiguous else x)
lazycache: LightWeakValueDictionary = LightWeakValueDictionary()
lazycache: WeakValueDictionary = WeakValueDictionary()
def create_lazybuffer(device:str, st:ShapeTracker, optype:OpType, op:LazyOp, dtype:DType):
# fromcpu aren't cached
if not LAZYCACHE or (optype is LoadOps and op.op in {LoadOps.EMPTY, LoadOps.RAND, LoadOps.CONST}): return LazyBuffer(device, st, optype, op, dtype)
@@ -109,7 +109,6 @@ def create_lazybuffer(device:str, st:ShapeTracker, optype:OpType, op:LazyOp, dty
return ret
class LazyBuffer:
__slots__ = 'st', 'device', 'shape', 'optype', 'dtype', 'op', 'realized', 'output_buffer', 'children', 'node_id', '__weakref__'
__deletable__ = ('op',)
def __init__(self, device:str, st:ShapeTracker, optype:OpType, op:LazyOp, dtype:DType, src:Optional[RawBuffer]=None):
self.st: ShapeTracker = st # NOTE: this is not a copy! this should be a "read-only" ShapeTracker
@@ -117,7 +116,7 @@ class LazyBuffer:
self.realized: Optional[RawBuffer] = src
self.output_buffer: Optional[RawBuffer] = None # TODO: do we really need this? or can we just use realized
# TODO: does children have to be a ref count instead of a set? can a Buffer be a double child?
self.children: LightWeakSet = LightWeakSet()
self.children: WeakSet = WeakSet()
# NOTE: op should be read only after construction of LazyBuffer
self.op: LazyOp = op
for x in op.buffers: x.children.add(self)

View File

@@ -10,7 +10,6 @@ class Contiguous(Function):
def backward(self, grad_output): return grad_output
class Cast(Function):
__slots__ = "input_dtype", "bitcast"
def forward(self, x:LazyBuffer, dtype:DType, bitcast=False):
self.input_dtype, self.bitcast = x.dtype, bitcast
return x.cast((dtype, bitcast))
@@ -20,7 +19,6 @@ class Cast(Function):
# ************* unary ops *************
class Sin(Function):
__slots__ = "x"
def forward(self, x:LazyBuffer) -> LazyBuffer:
self.x = x
return x.unary_op(UnaryOps.SIN)
@@ -29,7 +27,6 @@ class Sin(Function):
# NOTE: maximum(x, 0) behaves differently where x=0
class Relu(Function):
__slots__ = "ret"
def forward(self, x:LazyBuffer) -> LazyBuffer:
self.ret = x.binary_op(BinaryOps.MAX, 0)
return self.ret
@@ -38,7 +35,6 @@ class Relu(Function):
return (0 < self.ret) * grad_output
class Log(Function):
__slots__ = "x"
def forward(self, x:LazyBuffer) -> LazyBuffer:
self.x = x
return x.unary_op(UnaryOps.LOG2) * math.log(2)
@@ -47,7 +43,6 @@ class Log(Function):
return grad_output / self.x
class Exp(Function):
__slots__ = "ret"
def forward(self, x:LazyBuffer) -> LazyBuffer:
self.ret = (x * (1/math.log(2))).unary_op(UnaryOps.EXP2)
return self.ret
@@ -56,7 +51,6 @@ class Exp(Function):
return self.ret * grad_output
class Sqrt(Function):
__slots__ = "ret"
def forward(self, x:LazyBuffer) -> LazyBuffer:
self.ret = x.unary_op(UnaryOps.SQRT)
return self.ret
@@ -68,7 +62,6 @@ class Sqrt(Function):
# https://towardsdatascience.com/derivative-of-the-sigmoid-function-536880cf918e
# TODO: have the backend automatically find this
class Sigmoid(Function):
__slots__ = "ret"
def forward(self, x:LazyBuffer) -> LazyBuffer:
self.ret = 1 / (1 + (x * (-1/math.log(2))).unary_op(UnaryOps.EXP2))
return self.ret
@@ -79,7 +72,6 @@ class Sigmoid(Function):
# ************* reduce ops *************
class Sum(Function):
__slots__ = "input_shape"
def forward(self, x:LazyBuffer, new_shape:ShapeType) -> LazyBuffer:
self.input_shape = x.shape
return x.reduce_op(ReduceOps.SUM, new_shape)
@@ -88,7 +80,6 @@ class Sum(Function):
return grad_output.expand(self.input_shape)
class Max(Function):
__slots__ = "x", "ret"
def forward(self, x:LazyBuffer, new_shape:ShapeType) -> LazyBuffer:
self.x, self.ret = x, x.reduce_op(ReduceOps.MAX, new_shape)
return self.ret
@@ -122,7 +113,6 @@ class Sub(Function):
-grad_output if self.needs_input_grad[1] else None
class Mul(Function):
__slots__ = 'x', 'y'
def forward(self, x:LazyBuffer, y:LazyBuffer) -> LazyBuffer:
self.x, self.y = x, y
return x * y
@@ -132,7 +122,6 @@ class Mul(Function):
self.x * grad_output if self.needs_input_grad[1] else None
class Div(Function):
__slots__ = 'x', 'y'
def forward(self, x:LazyBuffer, y:LazyBuffer) -> LazyBuffer:
self.x, self.y = x, y
return x / y
@@ -144,7 +133,6 @@ class Div(Function):
# ************* ternary ops *************
class Where(Function):
__slots__ = "x"
def forward(self, x:LazyBuffer, y:LazyBuffer, z:LazyBuffer) -> LazyBuffer:
self.x = x
return x.ternary_op(TernaryOps.WHERE, y, z)
@@ -158,7 +146,6 @@ class Where(Function):
# NOTE: this is sum in reverse
class Expand(Function):
__slots__ = 'input_shape'
def forward(self, x:LazyBuffer, shape:ShapeType) -> LazyBuffer:
self.input_shape = x.shape
return x.expand(shape)
@@ -167,7 +154,6 @@ class Expand(Function):
return grad_output.reduce_op(ReduceOps.SUM, self.input_shape)
class Reshape(Function):
__slots__ = 'input_shape'
def forward(self, x:LazyBuffer, shape:ShapeType) -> LazyBuffer:
self.input_shape = x.shape
return x.reshape(shape)
@@ -176,7 +162,6 @@ class Reshape(Function):
return grad_output.reshape(self.input_shape)
class Permute(Function):
__slots__ = 'input_order'
def forward(self, x:LazyBuffer, order:Tuple[int, ...]) -> LazyBuffer:
self.input_order = order
return x.permute(order)
@@ -185,7 +170,6 @@ class Permute(Function):
return grad_output.permute(argsort(self.input_order))
class Pad(Function):
__slots__ = 'narg'
def forward(self, x:LazyBuffer, arg:Tuple[Tuple[int, int], ...]) -> LazyBuffer:
self.narg = tuple([(p[0], s+p[0]) for s,p in zip(x.shape, arg)])
return x.pad(arg)
@@ -194,7 +178,6 @@ class Pad(Function):
return grad_output.shrink(self.narg)
class Shrink(Function):
__slots__ = 'narg'
def forward(self, x:LazyBuffer, arg:Tuple[Tuple[int, int], ...]) -> LazyBuffer:
self.narg = tuple([(p[0], s-p[1]) for s,p in zip(x.shape, arg)])
return x.shrink(arg)
@@ -203,7 +186,6 @@ class Shrink(Function):
return grad_output.pad(self.narg)
class Flip(Function):
__slots__ = 'arg'
def forward(self, x:LazyBuffer, axis:Tuple[int, ...]):
self.arg = tuple([-1 if i in set(axis) else 1 for i in range(len(x.shape))])
return x.stride(self.arg)

View File

@@ -23,7 +23,6 @@ Op = Union[UnaryOps, BinaryOps, ReduceOps, MovementOps, LoadOps, TernaryOps]
OpType = Union[Type[UnaryOps], Type[BinaryOps], Type[ReduceOps], Type[MovementOps], Type[LoadOps], Type[TernaryOps]]
class LazyOp:
# TODO: add dest to support multiple outputs. on second thought, multiple outputs will have multiple LazyOps.
__slots__ = "op", "src", "arg", "buffers", "__weakref__"
op: Op
src: Tuple[Union[LazyOp, LazyBuffer], ...]