From 889acefe853a831d56cd5dc259da3186d4cd7666 Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Wed, 29 Nov 2023 08:30:46 -0800 Subject: [PATCH] Support weird loads in Image (#2498) * image support weird loads * umm, that was always wrong * openpilot compile fails with a weird error * image test passes * we have valids now * clean that up * no more required opts * add fastvits test, fix bug * minor cleanups --- .github/workflows/test.yml | 5 ++- openpilot/compile2.py | 10 ++---- test/test_image_dtype.py | 29 +++++++++++++++++ tinygrad/codegen/kernel.py | 26 ++++++--------- tinygrad/codegen/linearizer.py | 32 +++++++++++------- tinygrad/device.py | 3 -- tinygrad/features/image.py | 59 ++-------------------------------- tinygrad/lazy.py | 10 ++++-- tinygrad/realize.py | 6 +--- tinygrad/renderer/cstyle.py | 2 +- tinygrad/shape/symbolic.py | 2 +- 11 files changed, 79 insertions(+), 105 deletions(-) create mode 100644 test/test_image_dtype.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index fe3569b44c..4d2e298a50 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -182,7 +182,7 @@ jobs: - if: ${{ matrix.task == 'openpilot' }} name: Test openpilot model compile and size run: | - DEBUG=2 ALLOWED_KERNEL_COUNT=207 VALIDTEST=1 FLOAT16=1 DEBUGCL=1 GPU=1 IMAGE=2 python openpilot/compile2.py + DEBUG=2 ALLOWED_KERNEL_COUNT=207 FLOAT16=1 DEBUGCL=1 GPU=1 IMAGE=2 python openpilot/compile2.py python -c 'import os; assert os.path.getsize("/tmp/output.thneed") < 100_000_000' - if: ${{ matrix.task == 'openpilot' }} name: Test openpilot model correctness (float32) @@ -190,6 +190,9 @@ jobs: - if: ${{ matrix.task == 'openpilot' }} name: Test openpilot alt model correctness (float32) run: FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python openpilot/compile2.py https://github.com/commaai/openpilot/raw/3799fe46b3a629e491d4b8498b8ae83e4c88c304/selfdrive/modeld/models/supercombo.onnx + - if: ${{ matrix.task == 'openpilot' }} + name: Test openpilot fastvits model correctness (float32) + run: FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python openpilot/compile2.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx - if: ${{ matrix.task == 'openpilot' }} name: Test tensor core ops run: GPU=1 TC=2 python -m pytest -n=auto test/test_ops.py diff --git a/openpilot/compile2.py b/openpilot/compile2.py index f12325cd1e..036d3e4f1e 100644 --- a/openpilot/compile2.py +++ b/openpilot/compile2.py @@ -14,10 +14,9 @@ from typing import Tuple, List from extra.onnx import get_run_onnx from tinygrad.graph import print_tree, log_schedule_item from tinygrad import Tensor, Device -from tinygrad.helpers import dtypes, partition, GlobalCounters, Context, fetch, getenv, ImageDType, GRAPH +from tinygrad.helpers import dtypes, partition, GlobalCounters, Context, fetch, getenv, ImageDType, GRAPH, DEBUG from tinygrad.realize import run_schedule from tinygrad.ops import LoadOps, ScheduleItem -from tinygrad.features.image import fix_schedule_for_images Device.DEFAULT = "GPU" def get_schedule(onnx_data) -> Tuple[List[ScheduleItem], List[ScheduleItem]]: @@ -67,10 +66,6 @@ def schedule_to_thneed(schedule, output_fn): setattr(prg.clprg, 'op_estimate', prg.op_estimate) setattr(prg.clprg, 'prg', prg.prg) - if getenv("VALIDTEST") == 1: - src = re.search(r"=.*\?.*?read_image", prg.prg) - if src is not None: raise Exception("Openpilot has valid checks!") - global_size = prg.global_size + [1]*(3-len(prg.global_size)) local_size = prg.local_size + [1]*(3-len(prg.local_size)) cl_cache.append((prg.clprg, [[int(g*l) for g,l in zip(global_size, local_size)], local_size, *[x.realized._buf for x in args]])) @@ -146,8 +141,7 @@ if __name__ == "__main__": run_schedule(schedule_independent, disable_logging=True) run_schedule(schedule_input) - with Context(DEBUG=2, BEAM=getenv("LATEBEAM")): - schedule = fix_schedule_for_images(schedule) + with Context(DEBUG=max(DEBUG.value, 2), BEAM=getenv("LATEBEAM")): image_count = sum(isinstance(si.out.dtype, ImageDType) for si in schedule) print(f"**** running real kernels {image_count}/{len(schedule)} images ****") diff --git a/test/test_image_dtype.py b/test/test_image_dtype.py new file mode 100644 index 0000000000..d804234e9e --- /dev/null +++ b/test/test_image_dtype.py @@ -0,0 +1,29 @@ +import unittest +import numpy as np +from tinygrad import Device, dtypes, Tensor +from tinygrad.helpers import ImageDType + +@unittest.skipIf(Device.DEFAULT != "GPU", "only images on GPU") +class TestImageDType(unittest.TestCase): + def test_shrink_load_float(self): + it = Tensor.randn(4).cast(dtypes.imagef((1,1,4))).realize() + imgv = it.numpy() + np.testing.assert_equal(imgv[0:2], it[0:2].numpy()) + + def test_mul_stays_image(self): + it = Tensor.randn(4).cast(dtypes.imagef((1,1,4))).realize() + out = (it*2).realize() + assert isinstance(out.lazydata.realized.dtype, ImageDType) + + def test_shrink_max(self): + it = Tensor.randn(8).cast(dtypes.imagef((1,2,4))).realize() + imgv = it.numpy() + np.testing.assert_equal(np.maximum(imgv[0:3], 0), it[0:3].relu().numpy()) + + def test_shrink_to_float(self): + it = Tensor.randn(4, 4).cast(dtypes.imagef((1,4,4))).realize() + imgv = it.numpy() + np.testing.assert_equal(np.maximum(imgv[:, 0], 0), it[:, 0].relu().realize()) + +if __name__ == '__main__': + unittest.main() diff --git a/tinygrad/codegen/kernel.py b/tinygrad/codegen/kernel.py index 0ee4beffb8..3bcbc80661 100644 --- a/tinygrad/codegen/kernel.py +++ b/tinygrad/codegen/kernel.py @@ -468,21 +468,7 @@ class Kernel: assert padded, "nothing was padded" return self.simplify_ones() - def required_optimizations(self, early_only=False): - for buf_index,buf in enumerate(self.bufs): - unit_stride_axes_mul_4 = [i for i in self.sts[buf_index].unit_stride_axes(ignore_valid=True) if self.sts[buf_index].shape[i]%4 == 0] - if (not early_only or buf in self.earlybufs) and self.bufs[buf_index].dtype.__class__ is ImageDType: - assert len(unit_stride_axes_mul_4) >= 1, f"needs a unit stride axis in {self.bufs[buf_index]}" - if all(x < (self.shape_len-self.upcasted) for x in unit_stride_axes_mul_4) and unit_stride_axes_mul_4[0] not in self.upcast_in_mid_reduce_axes: - if unit_stride_axes_mul_4[0] < self.first_reduce: - self.apply_opt(Opt(OptOps.UPCAST, unit_stride_axes_mul_4[0], 4)) - else: - self.apply_opt(Opt(OptOps.UNROLL, unit_stride_axes_mul_4[0]-self.first_reduce, 4)) - def hand_coded_optimizations(self): - # if there's images in the earlybufs, we have to make an axis the 4 loading one - self.required_optimizations(early_only=True) - # should use matvec - TODO: adjust/tune based on the wide vs tall/large vs small mat MV_BLOCKSIZE, MV_THREADS_PER_ROW, MV_ROWS_PER_THREAD = getenv("MV_BLOCKSIZE", 4), getenv("MV_THREADS_PER_ROW", 8), getenv("MV_ROWS_PER_THREAD", 4) if self.opts.has_local and getenv("MV",1) != 0 and (MV_BLOCKSIZE > 1 or MV_THREADS_PER_ROW > 1 or MV_ROWS_PER_THREAD > 1) and \ @@ -522,8 +508,16 @@ class Kernel: if self.sts[0].shape[axes[0]]%4 == 0: self.apply_opt(Opt(OptOps.UPCASTMID, axes[0], 4)) - # now do everything required - self.required_optimizations() + # upcast float4 images + for buf_index,buf in enumerate(self.bufs): + unit_stride_axes_mul_4 = [i for i in self.sts[buf_index].unit_stride_axes(ignore_valid=True) if self.sts[buf_index].shape[i]%4 == 0] + if buf.dtype.__class__ is ImageDType: + #assert len(unit_stride_axes_mul_4) >= 1, f"needs a unit stride axis in {self.bufs[buf_index]}" + if len(unit_stride_axes_mul_4) and all(x < (self.shape_len-self.upcasted) for x in unit_stride_axes_mul_4) and unit_stride_axes_mul_4[0] not in self.upcast_in_mid_reduce_axes: + if unit_stride_axes_mul_4[0] < self.first_reduce: + self.apply_opt(Opt(OptOps.UPCAST, unit_stride_axes_mul_4[0], 4)) + else: + self.apply_opt(Opt(OptOps.UNROLL, unit_stride_axes_mul_4[0]-self.first_reduce, 4)) # no more opt if we are grouping if self.group_for_reduce: return diff --git a/tinygrad/codegen/linearizer.py b/tinygrad/codegen/linearizer.py index 78787e40a3..38070fe12b 100644 --- a/tinygrad/codegen/linearizer.py +++ b/tinygrad/codegen/linearizer.py @@ -92,20 +92,28 @@ class Linearizer(Kernel): if valid.min == 0 and valid.max == 1: valid_rendered = valid.render(self.render_ops, self) self.load_cache[key] = self.uop(UOps.ALU, localtype, (valid_rendered, self.load_cache[key], self.const(invalid_value, localtype)), TernaryOps.WHERE) + elif isinstance(buf.dtype, ImageDType): + buf_uop = self.buf_uops[i] + assert buf_uop is not None, f"buffer {i} wasn't UOped" + image_idx, valid = to_image_idx(buf.dtype.shape, idx, valid) + rendered_idx = self.uop(UOps.CAST, dtypes.int.vec(2), (image_idx[0].render(self.render_ops, self), image_idx[1].render(self.render_ops, self))) + valid_tuple = (valid.render(self.render_ops, self), self.const(invalid_value, dtypes.float32.vec(4))) if valid.min == 0 else tuple() + self.load_cache[key] = self.uop(UOps.LOAD, dtypes.float32.vec(4), (buf_uop, rendered_idx) + valid_tuple + ((barrier,) if barrier else ())) + idx_small = idx%4 + res = idx_small.render(self.render_ops, self) + if localtype == localtype.scalar(): + out = self.uop(UOps.GEP, localtype, (self.load_cache[key],), idx_small.max) + for ix in range(idx_small.max, idx_small.min, -1): + rvv = self.uop(UOps.GEP, localtype, (self.load_cache[key],), ix-1) + sel = self.uop(UOps.ALU, res.dtype, (res, self.const(ix)), BinaryOps.CMPLT) + out = self.uop(UOps.ALU, localtype, (sel, rvv, out), TernaryOps.WHERE) + self.load_cache[key] = out else: buf_uop = self.buf_uops[i] assert buf_uop is not None, f"buffer {i} wasn't UOped" - if isinstance(buf.dtype, ImageDType): - idx, valid = to_image_idx(buf.dtype.shape, idx, valid) - rendered_idx = self.uop(UOps.CAST, dtypes.int.vec(2), (idx[0].render(self.render_ops, self), idx[1].render(self.render_ops, self))) - else: - rendered_idx = idx.render(self.render_ops, self) - - if valid.min == 0: - valid_rendered = valid.render(self.render_ops, self) - self.load_cache[key] = self.uop(UOps.LOAD, localtype, (buf_uop, rendered_idx, valid_rendered, self.const(invalid_value, localtype)) + ((barrier,) if barrier else ())) - else: - self.load_cache[key] = self.uop(UOps.LOAD, localtype, (buf_uop, rendered_idx) + ((barrier,) if barrier else ())) + rendered_idx = idx.render(self.render_ops, self) + valid_tuple = (valid.render(self.render_ops, self), self.const(invalid_value, localtype)) if valid.min == 0 else tuple() + self.load_cache[key] = self.uop(UOps.LOAD, localtype, (buf_uop, rendered_idx) + valid_tuple + ((barrier,) if barrier else ())) ret.append(self.uop(UOps.GEP, localtype.scalar(), (self.load_cache[key],), rep_idx[dim]) if dim is not None else self.load_cache[key]) return ret @@ -383,7 +391,7 @@ class Linearizer(Kernel): for u in self.uops: if not loop_stack[-1]: loop_stack[-1].append(u) elif u.uop == UOps.LOOP: loop_stack.append([u]) - elif u.uop not in [UOps.CONST, UOps.ALU]: loop_stack[-1].append(u) + elif u.uop not in [UOps.CONST, UOps.ALU, UOps.CAST]: loop_stack[-1].append(u) else: parents = get_recursive_parents(u) for i in reversed(range(len(loop_stack))): diff --git a/tinygrad/device.py b/tinygrad/device.py index 55cdafd687..94e69b0a87 100644 --- a/tinygrad/device.py +++ b/tinygrad/device.py @@ -186,7 +186,6 @@ def _get_optimized_linearizer(linearizer_opts:LinearizerOptions, ast:LazyOp) -> if BEAM >= 1: lins = [(("tc" if used_tensor_cores else "hc"), k)] kb = Linearizer(ast, linearizer_opts) - kb.required_optimizations() from tinygrad.features.search import beam_search, time_linearizer, bufs_from_lin # TODO: this shouldn't use Device.DEFAULT, it should get the device from the LinearizerOptions test_rawbuffers = bufs_from_lin(kb) # allocate scratch buffers for optimization @@ -197,6 +196,4 @@ def _get_optimized_linearizer(linearizer_opts:LinearizerOptions, ast:LazyOp) -> timed = sorted([(nm, tk, time_linearizer(tk, test_rawbuffers, allow_test_size=False, clear_l2=True)) for nm, tk in lins], key=lambda x: x[2]) if DEBUG >= 1: print(" < ".join(f"{nm:6s} : {lin.colored_shape(30, dense=True)} : {tm*1e6:8.2f} us" for nm, lin, tm in timed)) k = timed[0][1] - else: - k.required_optimizations() return k \ No newline at end of file diff --git a/tinygrad/features/image.py b/tinygrad/features/image.py index 7356b7d73a..82d92f44ad 100644 --- a/tinygrad/features/image.py +++ b/tinygrad/features/image.py @@ -1,5 +1,5 @@ -from typing import List, Tuple, Dict, Any -from tinygrad.helpers import ImageDType, prod, IMAGE, getenv, dtypes, DEBUG, flatten +from typing import Tuple, Dict, Any +from tinygrad.helpers import prod, IMAGE, getenv, dtypes, DEBUG # *** image Tensor function replacements *** @@ -95,60 +95,6 @@ def image_conv2d(self, weight, bias=None, groups=1, stride=1, dilation=1, paddin ret = ret.reshape(bs, oy, ox, cout).permute(0,3,1,2) return ret if bias is None else ret.add(bias.reshape(1, -1, 1, 1)) -# *** schedules with images need to be fixed to be valid *** - -import dataclasses -from tinygrad.ops import ScheduleItem, BufferOps, LazyOp, UnaryOps, LoadOps, MemBuffer, get_lazyop_info - -def fix_schedule_for_images(schedule:List[ScheduleItem]): - # this is the fundamental fix, find unwritable or unreadable images and convert them to normal float32 (TODO: should it be float16?) - replace_inputs = {} - for i, si in enumerate(schedule): - if isinstance(si.out.dtype, ImageDType) and (prod(si.out.shape) != prod(si.out.dtype.shape) or not any(si.out.shape[x]%4 == 0 for x in si.out.st.unit_stride_axes())): - if DEBUG >= 1: print(f"{i:3d}: rewrite output, output shape {prod(si.out.shape)}, image dtype {si.out.dtype} prod {prod(si.out.dtype.shape)}") - si.out.dtype = dtypes.float32 - for b in si.ast.get_lazyops(): - if b.op != BufferOps.LOAD: continue - if isinstance(si.inputs[b.arg.idx-1].dtype, ImageDType) and not any(b.arg.st.shape[x]%4 == 0 for x in b.arg.st.unit_stride_axes()): - if DEBUG >= 1: print(f"{i:3d}: rewrite input, image dtype {si.inputs[b.arg.idx-1].dtype}, {b.arg.st.views}") - if si.inputs[b.arg.idx-1].realized: - # have to copy it - replace_inputs[si.inputs[b.arg.idx-1]] = si.inputs[b.arg.idx-1].cast(dtypes.float32) - else: - # change it before it's created - si.inputs[b.arg.idx-1].dtype = dtypes.float32 - - # now fix up the schedule to reflect the new dtypes - fixed_schedule:List[ScheduleItem] = [] - for i,si in enumerate(schedule): - ast = si.ast - inputs = si.inputs - - # replace inputs with casted versions - if any(x in replace_inputs for x in inputs): - fixed_schedule += flatten([replace_inputs[x].schedule() for x in inputs if x in replace_inputs]) - inputs = tuple(replace_inputs.get(x, x) for x in inputs) - - # fix input dtypes to match what they actually are - replacements = {} - for b in si.ast.get_lazyops(): - if b.op != BufferOps.LOAD: continue - if b.arg.dtype != inputs[b.arg.idx-1].dtype: - replacements[b] = LazyOp(BufferOps.LOAD, (), MemBuffer(b.arg.idx, inputs[b.arg.idx-1].dtype, b.arg.st)) - if replacements: ast = ast.map_buffers(replacements) - - # fix the ops to create the output dtype - if ast.op not in LoadOps: - info = get_lazyop_info(ast) - if info.dtype != si.out.dtype: - if DEBUG >= 3: print(f"{i:3d}: info.dtype {info.dtype} != {si.out.dtype} -> {si.out.dtype}") - ast_cast = LazyOp(UnaryOps.CAST, (ast.src[0],), (si.out.dtype, False)) - ast = LazyOp(BufferOps.STORE, (ast_cast,), MemBuffer(0, si.out.dtype, ast.arg.st)) - - # put this in the fixed schedule - fixed_schedule.append(dataclasses.replace(si, ast=ast, inputs=inputs)) - return fixed_schedule - # *** images have weird indexing requirements *** from tinygrad.shape.symbolic import Node, AndNode, Variable, NumNode, SumNode, LtNode @@ -178,6 +124,7 @@ def to_image_idx(base_shape:Tuple[int, ...], idxy:Node, valid:Node) -> Tuple[Tup fakes = {} for cnt, (key_node, (mnn, mxn, multip)) in enumerate(val_dict.items()): + if mnn > mxn: return (idx, idy), valid # TODO: why is this happening? fake_var = Variable("fake_" + str(cnt), mnn, mxn) fakes[fake_var] = key_node idxy += multip*(fake_var - key_node) diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py index c100005b80..b618af97b4 100644 --- a/tinygrad/lazy.py +++ b/tinygrad/lazy.py @@ -4,7 +4,7 @@ from typing import Callable, Optional, Tuple, Union, List, Dict, Any, cast, Mapp from weakref import ref, WeakSet, WeakValueDictionary import numpy as np -from tinygrad.helpers import prod, getenv, DType, dtypes, flatten, dedup, merge_dicts, all_int, ImageDType +from tinygrad.helpers import prod, getenv, DType, dtypes, flatten, dedup, merge_dicts, all_int, ImageDType, DEBUG from tinygrad.ops import ScheduleItem, UnaryOps, BinaryOps, TernaryOps, ReduceOps, MovementOps, LoadOps, OpType, LazyOp, MemBuffer, ConstBuffer, BufferOps, get_lazyop_info from tinygrad.shape.shapetracker import ShapeTracker, get_contraction from tinygrad.shape.symbolic import Variable, sint @@ -172,9 +172,15 @@ class LazyBuffer: if op.op not in LoadOps: info = get_lazyop_info(op) assert info.dtype == self.dtype or isinstance(self.dtype, ImageDType), f"dtype mismatch {info.dtype=} != {self.dtype=}" + + if isinstance(self.dtype, ImageDType) and (prod(self.shape) != prod(self.dtype.shape) or not any(self.shape[x]%4 == 0 for x in self.st.unit_stride_axes())): + if DEBUG >= 3: print(f"forcing image {self.dtype} to float32") + self.dtype = dtypes.float32 # NOTE; this is what makes the dtype above not match + op = LazyOp(UnaryOps.CAST, (op, ), (dtypes.float32, False)) + # TODO: why doesn't this match? #assert info.shape == self.shape, f"shape mismatch {info.shape=} != {self.shape=}" - op = LazyOp(BufferOps.STORE, (op, ), MemBuffer(0, info.dtype, ShapeTracker.from_shape(info.shape))) + op = LazyOp(BufferOps.STORE, (op, ), MemBuffer(0, self.dtype, ShapeTracker.from_shape(info.shape))) return ret + [ScheduleItem(op, self, tuple(base_bufs), {k:var_vals[k] for k in vars_from_ast(op)})] diff --git a/tinygrad/realize.py b/tinygrad/realize.py index 983ffe4749..2dd39073c6 100644 --- a/tinygrad/realize.py +++ b/tinygrad/realize.py @@ -4,13 +4,9 @@ from tinygrad.ops import ScheduleItem, LazyOp, LoadOps, BufferOps from tinygrad.device import Device from tinygrad.graph import log_schedule_item, print_tree from tinygrad.lazy import LazyBuffer -from tinygrad.helpers import DEBUG, prod, all_int, IMAGE, getenv -from tinygrad.features.image import fix_schedule_for_images +from tinygrad.helpers import DEBUG, prod, all_int, getenv def run_schedule(schedule:List[ScheduleItem], disable_logging=False): - # HACK: images can be not usable due to shape - if IMAGE >= 2: schedule = fix_schedule_for_images(schedule) - # NOTE: if you for loop the schedule it's slow because nothing frees while len(schedule): si = schedule.pop(0) diff --git a/tinygrad/renderer/cstyle.py b/tinygrad/renderer/cstyle.py index e32c89db2d..d6646ebd6f 100644 --- a/tinygrad/renderer/cstyle.py +++ b/tinygrad/renderer/cstyle.py @@ -52,7 +52,7 @@ class CStyleLanguage(NamedTuple): def render_const(self, x:Union[float,int], var_dtype) -> str: if math.isnan(x): val = "NAN" elif math.isinf(x): val = ("-" if x < 0 else "") + "INFINITY" - else: val = f"{x}f" if dtypes.is_float(var_dtype) and isinstance(x, float) else f"{int(x)}" + else: val = f"{float(x)}f" if dtypes.is_float(var_dtype) else f"{int(x)}" return self.render_cast([val]*var_dtype.sz, var_dtype) if var_dtype.sz > 1 else val # returns a str expression of the loaded value with the output type diff --git a/tinygrad/shape/symbolic.py b/tinygrad/shape/symbolic.py index a312f21003..77a8e4e3f1 100644 --- a/tinygrad/shape/symbolic.py +++ b/tinygrad/shape/symbolic.py @@ -132,7 +132,7 @@ class Node: class Variable(Node): def __new__(cls, expr:Optional[str], nmin:int, nmax:int): - assert nmin >= 0 and nmin <= nmax + assert nmin >= 0 and nmin <= nmax, f"invalid Variable {expr=} {nmin=} {nmax=}" if nmin == nmax: return NumNode(nmin) return super().__new__(cls)