not worth the speed (#1584)

* not worth the speed * no slots * uops comments * bump to python 3.11 for speed * add critical slots back
2026-06-11 23:46:02 +08:00 · 2023-08-20 10:24:58 -07:00
parent 739f327d2d
commit 012ee7d162
8 changed files with 24 additions and 180 deletions
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -14,10 +14,10 @@ jobs:
    steps:
    - name: Checkout Code
      uses: actions/checkout@v3
-    - name: Set up Python 3.8
+    - name: Set up Python 3.11
      uses: actions/setup-python@v4
      with:
-        python-version: 3.8
+        python-version: 3.11
    - name: Cache pip
      uses: actions/cache@v3
      with:
@@ -48,10 +48,10 @@ jobs:
    steps:
    - name: Checkout Code
      uses: actions/checkout@v3
-    - name: Set up Python 3.8
+    - name: Set up Python 3.11
      uses: actions/setup-python@v4
      with:
-        python-version: 3.8
+        python-version: 3.11
    - name: Cache pip
      uses: actions/cache@v3
      with:
@@ -84,10 +84,10 @@ jobs:
    steps:
    - name: Checkout Code
      uses: actions/checkout@v3
-    - name: Set up Python 3.8
+    - name: Set up Python 3.11
      uses: actions/setup-python@v4
      with:
-        python-version: 3.8
+        python-version: 3.11
    - name: Cache pip
      uses: actions/cache@v3
      with:
@@ -121,10 +121,10 @@ jobs:
          echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
          sudo apt update
          sudo apt install -y --no-install-recommends intel-oneapi-runtime-compilers intel-oneapi-runtime-opencl
-      - name: Set up Python 3.8
+      - name: Set up Python 3.11
        uses: actions/setup-python@v4
        with:
-          python-version: 3.8
+          python-version: 3.11
      - name: Cache pip
        uses: actions/cache@v3
        with:
@@ -209,10 +209,10 @@ jobs:
    steps:
      - name: Checkout Code
        uses: actions/checkout@v3
-      - name: Set up Python 3.8
+      - name: Set up Python 3.11
        uses: actions/setup-python@v4
        with:
-          python-version: 3.8
+          python-version: 3.11
      - name: Cache pip
        uses: actions/cache@v3
        with:
@@ -279,10 +279,10 @@ jobs:
    steps:
      - name: Checkout Code
        uses: actions/checkout@v3
-      - name: Set up Python 3.8
+      - name: Set up Python 3.11
        uses: actions/setup-python@v4
        with:
-          python-version: 3.8
+          python-version: 3.11
      - name: Cache pip
        uses: actions/cache@v3
        with:
--- a/test/unit/test_weak.py
+++ b/test/unit/test_weak.py
@@ -1,103 +0,0 @@
-from tinygrad.helpers import LightWeakSet, LightWeakValueDictionary
-import unittest
-import time
-
-CNT = 1000
-
-cnt = 0
-class MyObject:
-  def __init__(self):
-    global cnt
-    self.cnt = cnt
-    cnt += 1
-    #print(f"object {self.cnt} created")
-  #def __del__(self): print(f"object {self.cnt} destroyed")
-
-class TestWeak(unittest.TestCase):
-  def test_set_drops(self):
-    ss = LightWeakSet()
-    ss.add(MyObject())
-    assert len(ss) == 0
-
-  def test_set_holds(self):
-    ss = LightWeakSet()
-    obj = MyObject()
-    ss.add(obj)
-    assert len(ss) == 1
-
-  def test_set_late_drops(self):
-    ss = LightWeakSet()
-    obj = MyObject()
-    ss.add(obj)
-    assert len(ss) == 1
-    del obj
-    assert len(ss) == 0
-
-  def test_dict_drops(self):
-    dd = LightWeakValueDictionary()
-    dd[0] = MyObject()
-    assert 0 not in dd
-
-  def test_dict_holds(self):
-    dd = LightWeakValueDictionary()
-    dd[0] = ret = MyObject()
-    assert 0 in dd
-
-  def test_a_myobj_microbench(self):
-    for _ in range(3):
-      st = time.perf_counter_ns()
-      for _ in range(CNT):
-        obj = MyObject()
-      et = (time.perf_counter_ns() - st)/CNT
-      print(f"{et:.2f} ns to create MyObject")
-
-  def test_set_add_microbench(self):
-    for _ in range(3):
-      ss = LightWeakSet()
-      st = time.perf_counter_ns()
-      for _ in range(CNT):
-        obj = MyObject()
-        ss.add(obj)
-      assert len(ss) == 1
-      et = (time.perf_counter_ns() - st)/CNT
-      print(f"{et:.2f} ns to add to LightWeakSet")
-
-  def test_set_del_microbench(self):
-    for _ in range(3):
-      ss = LightWeakSet()
-      st = time.perf_counter_ns()
-      for _ in range(CNT):
-        obj = MyObject()
-        ss.add(obj)
-        ss.discard(obj)
-      assert len(ss) == 0
-      et = (time.perf_counter_ns() - st)/CNT
-      print(f"{et:.2f} ns to add/del from LightWeakSet")
-
-  def test_dict_add_microbench(self):
-    for _ in range(3):
-      dd = LightWeakValueDictionary()
-      st = time.perf_counter_ns()
-      for i in range(CNT):
-        obj = MyObject()
-        dd[i] = obj
-      assert len(dd) == 1
-      et = (time.perf_counter_ns() - st)/CNT
-      print(f"{et:.2f} ns to add to LightWeakDict")
-
-  def test_dict_check_microbench(self):
-    for _ in range(3):
-      dd = LightWeakValueDictionary()
-      st = time.perf_counter_ns()
-      for i in range(CNT):
-        obj = MyObject()
-        dd[i] = obj
-        assert i in dd
-        tst = dd[i]
-        del obj,tst
-      assert len(dd) == 0
-      et = (time.perf_counter_ns() - st)/CNT
-      print(f"{et:.2f} ns to add/del from LightWeakDict")
-
-if __name__ == '__main__':
-  unittest.main()
--- a/tinygrad/codegen/assembly.py
+++ b/tinygrad/codegen/assembly.py
@@ -184,8 +184,6 @@ def uops_to_asmstyle(lang, function_name:str, uops:List[UOp]):
    elif uop == UOps.STORE:
      idx, treg, off = lang.addr_w_offset(args)
      lang.ins.append(AssemblyInstruction(UOps.STORE, None, [idx, lang.tor[vin[0]]] + ([treg] if treg is not None else []), (off, 'global' if not args.local else 'shared', args.memory_dtype if args.memory_dtype != dtypes.float else None)))
-  # define registers
-  lang.ins = [AssemblyInstruction(UOps.DEFINE_REGISTER, None, [], (dtype, lang.type_to_letter(dtype), c)) for dtype,c in lang.cnts.items()] + lang.ins

  if DEBUG >= 4:
    for tins in lang.ins: print(tins)
--- a/tinygrad/codegen/linearizer.py
+++ b/tinygrad/codegen/linearizer.py
@@ -13,8 +13,14 @@ from tinygrad.shape.symbolic import Variable, NumNode, Node, SumNode, MulNode, s
 VariableOrNum = Union[Variable, NumNode, Node]

 # bottom ones are asm only
-class UOps(Enum): LOOP = auto(); DEFINE_LOCAL = auto(); DEFINE_GLOBAL = auto(); LOAD = auto(); ALU = auto(); ENDLOOP = auto(); STORE = auto(); CAST = auto(); BARRIER = auto(); WMMA = auto(); \
-                  SPECIAL = auto(); DEFINE_REGISTER = auto(); LABEL = auto(); COND_BRANCH = auto() # noqa: E702
+class UOps(Enum):
+  LOOP = auto(); ENDLOOP = auto() # loops can be global, local, or other # noqa: E702
+  DEFINE_GLOBAL = auto(); DEFINE_LOCAL = auto() # this defines buffers # noqa: E702
+  LOAD = auto(); STORE = auto(); BARRIER = auto() # noqa: E702
+  ALU = auto(); WMMA = auto(); CAST = auto() # noqa: E702
+  # TODO: add CONST. use ALU WHERE for gated load
+  # *** assembly only UOps ***
+  SPECIAL = auto(); LABEL = auto(); COND_BRANCH = auto() # TODO: replace these with LOOP and ENDLOOP # noqa: E702

 def to_image_idx(base_shape:Tuple[int, ...], idxy:Node, valid:Node, validhacks=False) -> Tuple[Node, Node]:
  idy = (idxy//(4*base_shape[1]))
--- a/tinygrad/helpers.py
+++ b/tinygrad/helpers.py
@@ -1,7 +1,5 @@
 from __future__ import annotations
 import os, functools, platform, time, re, contextlib
-from weakref import KeyedRef, ref
-from _weakref import _remove_dead_weakref # type: ignore
 import numpy as np
 from typing import Dict, Tuple, Union, List, NamedTuple, Final, Iterator, ClassVar, Optional, Callable, Any, Iterable
 from math import prod # noqa: F401 # pylint:disable=unused-import
@@ -42,7 +40,6 @@ class Context(contextlib.ContextDecorator):

 class ContextVar:
  _cache: ClassVar[Dict[str, ContextVar]] = {}
-  __slots__ = "value"
  value: int
  def __new__(cls, key, default_value):
    if key in ContextVar._cache: return ContextVar._cache[key]
@@ -134,37 +131,3 @@ class GlobalCounters:
  cache: ClassVar[Optional[List[Tuple[Callable, Any, Dict[Any, int]]]]] = None  # List[Tuple[Callable, List[RawBuffer], Dict[Variable, int]]]
  @staticmethod
  def reset(): GlobalCounters.global_ops, GlobalCounters.global_mem, GlobalCounters.time_sum_s, GlobalCounters.kernel_count, GlobalCounters.cache = 0,0,0.0,0,None
-
-# Stripped down version of a WeakSet
-class LightWeakSet:
-  __slots__ = 'data', '_remove', '__weakref__'
-  def __init__(self):
-    self.data = set()
-    def _remove(item, selfref=ref(self)):
-      self = selfref()
-      if self: self.data.discard(item)
-    self._remove = _remove
-
-  def __len__(self): return len(self.data)
-  def add(self, item): self.data.add(ref(item, self._remove))
-  def discard(self, item): self.data.discard(ref(item))
-
-# Stripped down version of a WeakValueDictionary
-class LightWeakValueDictionary:
-  __slots__ = 'data', '_remove', '__weakref__'
-  def __init__(self):
-    def remove(wr, selfref=ref(self), _atomic_removal=_remove_dead_weakref):
-      self = selfref()
-      if self: _atomic_removal(self.data, wr.key)
-    self._remove = remove
-    self.data = {}
-
-  def __getitem__(self, key):
-    o = self.data[key]()
-    if o is None: raise KeyError(key)
-    else: return o
-
-  def __len__(self): return len(self.data)
-  def __delitem__(self, key): del self.data[key]
-  def __setitem__(self, key, value): self.data[key] = KeyedRef(value, self._remove, key)
-  def __contains__(self, key): return key in self.data
--- a/tinygrad/lazy.py
+++ b/tinygrad/lazy.py
@@ -2,10 +2,10 @@ from __future__ import annotations
 import operator, math
 from typing import Callable, Optional, Tuple, Union, List, Dict, Any, cast
 import sys, importlib, inspect, functools, pathlib
-from weakref import ref
+from weakref import ref, WeakSet, WeakValueDictionary

 import numpy as np
-from tinygrad.helpers import GRAPH, DEBUG, prod, getenv, DType, dtypes, flatten, ImageDType, LightWeakSet, LightWeakValueDictionary
+from tinygrad.helpers import GRAPH, DEBUG, prod, getenv, DType, dtypes, flatten, ImageDType
 from tinygrad.runtime.ops_cpu import RawNumpyBuffer
 from tinygrad.runtime.ops_disk import RawDiskBuffer
 from tinygrad.shape.shapetracker import MovementOps, ShapeTracker, View, get_contraction
@@ -94,7 +94,7 @@ def get_single_root(root:LazyBuffer) -> LazyBuffer: return get_single_root(cast(
 def get_movementroot(root:LazyBuffer, allow_contiguous=False) -> LazyBuffer: return get_movementroot(cast(LazyBuffer, root.op.src[0]), allow_contiguous) if not root.realized and (root.optype == MovementOps or (root.op.op == LoadOps.CONTIGUOUS and allow_contiguous and root.op.src[0].st.contiguous)) else root
 def get_movementroot_contiguous(x:LazyBuffer) -> LazyBuffer: return get_movementroot_contiguous(cast(LazyBuffer, x.op.src[0])) if not x.realized and x.op.op == LoadOps.CONTIGUOUS else (get_movementroot(x, True) if x.optype == MovementOps and x.st.contiguous else x)

-lazycache: LightWeakValueDictionary = LightWeakValueDictionary()
+lazycache: WeakValueDictionary = WeakValueDictionary()
 def create_lazybuffer(device:str, st:ShapeTracker, optype:OpType, op:LazyOp, dtype:DType):
  # fromcpu aren't cached
  if not LAZYCACHE or (optype is LoadOps and op.op in {LoadOps.EMPTY, LoadOps.RAND, LoadOps.CONST}): return LazyBuffer(device, st, optype, op, dtype)
@@ -109,7 +109,6 @@ def create_lazybuffer(device:str, st:ShapeTracker, optype:OpType, op:LazyOp, dty
  return ret

 class LazyBuffer:
-  __slots__ = 'st', 'device', 'shape', 'optype', 'dtype', 'op', 'realized', 'output_buffer', 'children', 'node_id', '__weakref__'
  __deletable__ = ('op',)
  def __init__(self, device:str, st:ShapeTracker, optype:OpType, op:LazyOp, dtype:DType, src:Optional[RawBuffer]=None):
    self.st: ShapeTracker = st  # NOTE: this is not a copy! this should be a "read-only" ShapeTracker
@@ -117,7 +116,7 @@ class LazyBuffer:
    self.realized: Optional[RawBuffer] = src
    self.output_buffer: Optional[RawBuffer] = None   # TODO: do we really need this? or can we just use realized
    # TODO: does children have to be a ref count instead of a set? can a Buffer be a double child?
-    self.children: LightWeakSet = LightWeakSet()
+    self.children: WeakSet = WeakSet()
    # NOTE: op should be read only after construction of LazyBuffer
    self.op: LazyOp = op
    for x in op.buffers: x.children.add(self)
--- a/tinygrad/mlops.py
+++ b/tinygrad/mlops.py
@@ -10,7 +10,6 @@ class Contiguous(Function):
  def backward(self, grad_output): return grad_output

 class Cast(Function):
-  __slots__ = "input_dtype", "bitcast"
  def forward(self, x:LazyBuffer, dtype:DType, bitcast=False):
    self.input_dtype, self.bitcast = x.dtype, bitcast
    return x.cast((dtype, bitcast))
@@ -20,7 +19,6 @@ class Cast(Function):
 # ************* unary ops *************

 class Sin(Function):
-  __slots__ = "x"
  def forward(self, x:LazyBuffer) -> LazyBuffer:
    self.x = x
    return x.unary_op(UnaryOps.SIN)
@@ -29,7 +27,6 @@ class Sin(Function):

 # NOTE: maximum(x, 0) behaves differently where x=0
 class Relu(Function):
-  __slots__ = "ret"
  def forward(self, x:LazyBuffer) -> LazyBuffer:
    self.ret = x.binary_op(BinaryOps.MAX, 0)
    return self.ret
@@ -38,7 +35,6 @@ class Relu(Function):
    return (0 < self.ret) * grad_output

 class Log(Function):
-  __slots__ = "x"
  def forward(self, x:LazyBuffer) -> LazyBuffer:
    self.x = x
    return x.unary_op(UnaryOps.LOG2) * math.log(2)
@@ -47,7 +43,6 @@ class Log(Function):
    return grad_output / self.x

 class Exp(Function):
-  __slots__ = "ret"
  def forward(self, x:LazyBuffer) -> LazyBuffer:
    self.ret = (x * (1/math.log(2))).unary_op(UnaryOps.EXP2)
    return self.ret
@@ -56,7 +51,6 @@ class Exp(Function):
    return self.ret * grad_output

 class Sqrt(Function):
-  __slots__ = "ret"
  def forward(self, x:LazyBuffer) -> LazyBuffer:
    self.ret = x.unary_op(UnaryOps.SQRT)
    return self.ret
@@ -68,7 +62,6 @@ class Sqrt(Function):
 # https://towardsdatascience.com/derivative-of-the-sigmoid-function-536880cf918e
 # TODO: have the backend automatically find this
 class Sigmoid(Function):
-  __slots__ = "ret"
  def forward(self, x:LazyBuffer) -> LazyBuffer:
    self.ret = 1 / (1 + (x * (-1/math.log(2))).unary_op(UnaryOps.EXP2))
    return self.ret
@@ -79,7 +72,6 @@ class Sigmoid(Function):
 # ************* reduce ops *************

 class Sum(Function):
-  __slots__ = "input_shape"
  def forward(self, x:LazyBuffer, new_shape:ShapeType) -> LazyBuffer:
    self.input_shape = x.shape
    return x.reduce_op(ReduceOps.SUM, new_shape)
@@ -88,7 +80,6 @@ class Sum(Function):
    return grad_output.expand(self.input_shape)

 class Max(Function):
-  __slots__ = "x", "ret"
  def forward(self, x:LazyBuffer, new_shape:ShapeType) -> LazyBuffer:
    self.x, self.ret = x, x.reduce_op(ReduceOps.MAX, new_shape)
    return self.ret
@@ -122,7 +113,6 @@ class Sub(Function):
           -grad_output if self.needs_input_grad[1] else None

 class Mul(Function):
-  __slots__ = 'x', 'y'
  def forward(self, x:LazyBuffer, y:LazyBuffer) -> LazyBuffer:
    self.x, self.y = x, y
    return x * y
@@ -132,7 +122,6 @@ class Mul(Function):
           self.x * grad_output if self.needs_input_grad[1] else None

 class Div(Function):
-  __slots__ = 'x', 'y'
  def forward(self, x:LazyBuffer, y:LazyBuffer) -> LazyBuffer:
    self.x, self.y = x, y
    return x / y
@@ -144,7 +133,6 @@ class Div(Function):
 # ************* ternary ops *************

 class Where(Function):
-  __slots__ = "x"
  def forward(self, x:LazyBuffer, y:LazyBuffer, z:LazyBuffer) -> LazyBuffer:
    self.x = x
    return x.ternary_op(TernaryOps.WHERE, y, z)
@@ -158,7 +146,6 @@ class Where(Function):

 # NOTE: this is sum in reverse
 class Expand(Function):
-  __slots__ = 'input_shape'
  def forward(self, x:LazyBuffer, shape:ShapeType) -> LazyBuffer:
    self.input_shape = x.shape
    return x.expand(shape)
@@ -167,7 +154,6 @@ class Expand(Function):
    return grad_output.reduce_op(ReduceOps.SUM, self.input_shape)

 class Reshape(Function):
-  __slots__ = 'input_shape'
  def forward(self, x:LazyBuffer, shape:ShapeType) -> LazyBuffer:
    self.input_shape = x.shape
    return x.reshape(shape)
@@ -176,7 +162,6 @@ class Reshape(Function):
    return grad_output.reshape(self.input_shape)

 class Permute(Function):
-  __slots__ = 'input_order'
  def forward(self, x:LazyBuffer, order:Tuple[int, ...]) -> LazyBuffer:
    self.input_order = order
    return x.permute(order)
@@ -185,7 +170,6 @@ class Permute(Function):
    return grad_output.permute(argsort(self.input_order))

 class Pad(Function):
-  __slots__ = 'narg'
  def forward(self, x:LazyBuffer, arg:Tuple[Tuple[int, int], ...]) -> LazyBuffer:
    self.narg = tuple([(p[0], s+p[0]) for s,p in zip(x.shape, arg)])
    return x.pad(arg)
@@ -194,7 +178,6 @@ class Pad(Function):
    return grad_output.shrink(self.narg)

 class Shrink(Function):
-  __slots__ = 'narg'
  def forward(self, x:LazyBuffer, arg:Tuple[Tuple[int, int], ...]) -> LazyBuffer:
    self.narg = tuple([(p[0], s-p[1]) for s,p in zip(x.shape, arg)])
    return x.shrink(arg)
@@ -203,7 +186,6 @@ class Shrink(Function):
    return grad_output.pad(self.narg)

 class Flip(Function):
-  __slots__ = 'arg'
  def forward(self, x:LazyBuffer, axis:Tuple[int, ...]):
    self.arg = tuple([-1 if i in set(axis) else 1 for i in range(len(x.shape))])
    return x.stride(self.arg)
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -23,7 +23,6 @@ Op = Union[UnaryOps, BinaryOps, ReduceOps, MovementOps, LoadOps, TernaryOps]
 OpType = Union[Type[UnaryOps], Type[BinaryOps], Type[ReduceOps], Type[MovementOps], Type[LoadOps], Type[TernaryOps]]

 class LazyOp:
-  # TODO: add dest to support multiple outputs. on second thought, multiple outputs will have multiple LazyOps.
  __slots__ = "op", "src", "arg", "buffers", "__weakref__"
  op: Op
  src: Tuple[Union[LazyOp, LazyBuffer], ...]