diff --git a/docs/abstractions.py b/docs/abstractions.py
index dfbb71fee5..5d096a52f2 100644
--- a/docs/abstractions.py
+++ b/docs/abstractions.py
@@ -263,14 +263,13 @@ class Linearizer:
   uops: List[UOp]
 
 from tinygrad.tensor import Tensor
-from tinygrad.helpers import prod
 result = Tensor(2).realize() + Tensor(3).realize()
-result.lazydata.realized = Device[Device.DEFAULT].buffer(prod(result.shape), result.dtype)
 
 # use the real Linearizer to linearize 2+3
+from tinygrad.lazy import _replace_loadops
 from tinygrad.codegen.linearizer import Linearizer
-from tinygrad.codegen.kernel import LinearizerOptions
-linearizer = Linearizer(result.lazydata.op, result.lazydata, LinearizerOptions())
+op, _ = _replace_loadops(result.lazydata.op)
+linearizer = Linearizer(op)
 linearizer.linearize()
 
 # print the uops
@@ -279,13 +278,11 @@ for uop in linearizer.uops: print(uop)
 # output:
 """
    0 UOps.DEFINE_GLOBAL  : ptr.dtypes.float          []                               ('data0', dtypes.float)
-   1 UOps.LOOP           :                           []                               ([], 'global')
-   2 UOps.LOOP           :                           []                               ([], 'local')
-   3 UOps.CONST          : dtypes.float              []                               2.0
-   4 UOps.CONST          : dtypes.float              []                               3.0
-   5 UOps.ALU            : dtypes.float              [3, 4]                           BinaryOps.ADD
-   6 UOps.STORE          :                           [5]                              MemOp(name='data0', idx=<0>, local=False, memory_dtype=dtypes.float, valid=<1>, invalid_value=0.0)
-   7 UOps.ENDLOOP        :                           []                               ([], 'global+local')
+   1 UOps.CONST          : dtypes.float              []                               2.0
+   2 UOps.CONST          : dtypes.float              []                               3.0
+   3 UOps.ALU            : dtypes.float              [1, 2]                           BinaryOps.ADD
+   4 UOps.CONST          : dtypes.int                []                               0
+   5 UOps.STORE          :                           [0, 4, 3]                        None
 """
 
 # %%
diff --git a/test/models/test_real_world.py b/test/models/test_real_world.py
index 0e53f88de0..b280869e92 100644
--- a/test/models/test_real_world.py
+++ b/test/models/test_real_world.py
@@ -14,17 +14,17 @@ from examples.hlb_cifar10 import SpeedyResNet
 from examples.llama import Transformer as LLaMaTransformer, MODEL_PARAMS as LLAMA_MODEL_PARAMS
 from examples.stable_diffusion import UNetModel
 
-def kopt_search_hook(k, create_k, to_prg, baseline):
+def kopt_search_hook(k, create_k, to_prg, baseline, bufs):
   import nevergrad as ng
-  wanna_output = k.bufs[0].toCPU().copy()
+  wanna_output = bufs[0].toCPU().copy()
   def check_opt(x):
     try:
       k = create_k()
       k.process()
       k.apply_auto_opt(x)
       prg = to_prg(k)
-      first_tm = prg.exec(k.bufs, force_wait=True, optimizing=True)
-      np.testing.assert_allclose(wanna_output, k.bufs[0].toCPU(), atol=1e-4, rtol=1e-4)
+      first_tm = prg.exec(bufs, force_wait=True, optimizing=True)
+      np.testing.assert_allclose(wanna_output, bufs[0].toCPU(), atol=1e-4, rtol=1e-4)
       return first_tm
     except Exception:
       return 10000_000   # 10000 seconds is infinity
diff --git a/test/test_custom_function.py b/test/test_custom_function.py
index 2644d781c9..d41aeb65c2 100644
--- a/test/test_custom_function.py
+++ b/test/test_custom_function.py
@@ -24,7 +24,7 @@ def atan2_gpu(ret:LazyBuffer, a:LazyBuffer, b:LazyBuffer):
     __kernel void atan2_gpu(global float *c, global float *a, global float *b) {
       int idx = get_global_id(0);
       c[idx] = atan2(a[idx], b[idx]);
-    }""", global_size=[prod(ret.shape)]).build(Device[ret.device].runtime).exec([ret, a, b])
+    }""", global_size=[prod(ret.shape)]).build(Device[ret.device].runtime).exec([ret.realized, a.realized, b.realized])
   return ret.realized
 
 def atan2_cpu(ret:LazyBuffer, a:LazyBuffer, b:LazyBuffer):
diff --git a/test/test_linearizer.py b/test/test_linearizer.py
index 199c2a821c..9da77cddb0 100644
--- a/test/test_linearizer.py
+++ b/test/test_linearizer.py
@@ -5,6 +5,7 @@ from tinygrad.codegen.linearizer import Linearizer, UOps
 from tinygrad.ops import Compiled, Device, MovementOps, LazyOp
 from tinygrad.tensor import Tensor
 from tinygrad.jit import CacheCollector
+from tinygrad.lazy import _replace_loadops
 
 class TestLinearizer(unittest.TestCase):
   def test_arg_dedup(self):
@@ -30,7 +31,7 @@ class TestLinearizer(unittest.TestCase):
     r = a[:-1] + a[1:]
     ast = r.lazydata.op
     r = r.realize()  # realize an output buffer
-    k = Linearizer(ast, r.lazydata, Device[Device.DEFAULT].linearizer_opts)
+    k = Linearizer(_replace_loadops(ast)[0], Device[Device.DEFAULT].linearizer_opts)
     k.process()
     k.upcast()
     k.linearize()
@@ -48,7 +49,7 @@ class TestLinearizer(unittest.TestCase):
     r = a.expand([2]) + b.expand([2])
     ast = r.lazydata.op
     r = r.realize()  # realize an output buffer
-    k = Linearizer(ast, r.lazydata, Device[Device.DEFAULT].linearizer_opts)
+    k = Linearizer(_replace_loadops(ast)[0], Device[Device.DEFAULT].linearizer_opts)
     k.process()
     k.upcast()
     k.linearize()
@@ -63,7 +64,7 @@ class TestLinearizer(unittest.TestCase):
     r = Tensor.stack([a, b])
     ast = r.lazydata.op
     r = r.realize()  # realize an output buffer
-    k = Linearizer(ast, r.lazydata, Device[Device.DEFAULT].linearizer_opts)
+    k = Linearizer(_replace_loadops(ast)[0], Device[Device.DEFAULT].linearizer_opts)
     k.process()
     k.upcast()
     k.linearize()
@@ -79,7 +80,7 @@ class TestLinearizer(unittest.TestCase):
     r = a * b
     ast = r.lazydata.op
     r = r.realize()  # realize an output buffer
-    k = Linearizer(ast, r.lazydata, Device[Device.DEFAULT].linearizer_opts)
+    k = Linearizer(_replace_loadops(ast)[0], Device[Device.DEFAULT].linearizer_opts)
     k.process()
     k.linearize()
     num_ops = len([uop for uop in k.uops if uop.uop in [UOps.LOAD, UOps.ALU]])
@@ -88,12 +89,14 @@ class TestLinearizer(unittest.TestCase):
 def helper_linearizer_opt(r:Tensor, opts=[]):
   wanna_output = None
   realized_ast = None
+  real_bufs = None
 
   # HACK to get real ast.
   real_dev_exec_ast = Device[Device.DEFAULT].exec_ast
-  def fake_exec_ast(ast, output=None, **kwargs):
-    nonlocal realized_ast
-    x = real_dev_exec_ast(ast, output, **kwargs)
+  def fake_exec_ast(ast, output=None, inputs=None, **kwargs):
+    nonlocal realized_ast, real_bufs
+    x = real_dev_exec_ast(ast, output, inputs, **kwargs)
+    real_bufs = [output.realized] + inputs
     if not(ast.op in MovementOps and ast.src[0].__class__ is not LazyOp and ast.src[0].realized): realized_ast = ast # get last executed
     return x
   Device[Device.DEFAULT].exec_ast = fake_exec_ast
@@ -106,26 +109,26 @@ def helper_linearizer_opt(r:Tensor, opts=[]):
     k.process()
     k.apply_auto_opt(x)
     prg = to_prg(k)
-    k.bufs[0].realized = k.bufs[0].realized.fromCPU(np.zeros(k.bufs[0].shape, dtype=k.bufs[0].dtype.np)) # Zero to check that all values are filled
-    prg.exec(k.bufs, force_wait=True)
-    np.testing.assert_allclose(wanna_output, k.bufs[0].toCPU(), atol=1e-4, rtol=1e-4)
+    real_bufs[0] = real_bufs[0].fromCPU(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np)) # Zero to check that all values are filled
+    prg.exec(real_bufs, force_wait=True)
+    np.testing.assert_allclose(wanna_output, real_bufs[0].toCPU(), atol=1e-4, rtol=1e-4)
 
   # Get baseline, which is not optimized at all.
-  k = Linearizer(realized_ast, r.lazydata, Device[Device.DEFAULT].linearizer_opts)
+  k = Linearizer(realized_ast, Device[Device.DEFAULT].linearizer_opts)
   k.process()
   prg = Device[Device.DEFAULT].to_program(k)
-  prg.exec(k.bufs, force_wait=True)
-  wanna_output = k.bufs[0].toCPU().copy()
+  prg.exec(real_bufs, force_wait=True)
+  wanna_output = real_bufs[0].toCPU().copy()
 
   # Check correctness of handcoded optimiztions.
-  k = Linearizer(realized_ast, r.lazydata, Device[Device.DEFAULT].linearizer_opts)
+  k = Linearizer(realized_ast, Device[Device.DEFAULT].linearizer_opts)
   k.hand_coded_optimizations()
   prg = Device[Device.DEFAULT].to_program(k)
-  k.bufs[0].realized = k.bufs[0].realized.fromCPU(np.zeros(k.bufs[0].shape, dtype=k.bufs[0].dtype.np)) # Zero to check that all values are filled
-  prg.exec(k.bufs, force_wait=True)
-  np.testing.assert_allclose(wanna_output, k.bufs[0].toCPU(), atol=1e-4, rtol=1e-4)
+  real_bufs[0] = real_bufs[0].fromCPU(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np)) # Zero to check that all values are filled
+  prg.exec(real_bufs, force_wait=True)
+  np.testing.assert_allclose(wanna_output, real_bufs[0].toCPU(), atol=1e-4, rtol=1e-4)
   for x in opts: # Check custom transformations if any.
-    check_opt(x, lambda: Linearizer(realized_ast, r.lazydata, Device[Device.DEFAULT].linearizer_opts), Device[Device.DEFAULT].to_program)
+    check_opt(x, lambda: Linearizer(realized_ast, Device[Device.DEFAULT].linearizer_opts), Device[Device.DEFAULT].to_program)
 
 class TestLinearizerOpts(unittest.TestCase):
   def test_local_and_grouped_reduce(self):
diff --git a/test/unit/test_flopcounter.py b/test/unit/test_flopcounter.py
index 14fcc8c8c8..69507d52b4 100644
--- a/test/unit/test_flopcounter.py
+++ b/test/unit/test_flopcounter.py
@@ -1,18 +1,13 @@
 #!/usr/bin/env python
 import unittest
-from typing import NamedTuple, Tuple
-from tinygrad.ops import LazyOp, BinaryOps, ReduceOps, get_lazyop_info
-from tinygrad.helpers import DType, dtypes
-
-class TestBuffer(NamedTuple):
-  __test__ = False # To prevent pytest from collecting this as a test
-  shape: Tuple[int, ...]
-  dtype: DType
+from tinygrad.ops import LazyOp, BinaryOps, ReduceOps, get_lazyop_info, LoadOps, MemBuffer
+from tinygrad.shape.view import View
+from tinygrad.helpers import dtypes
 
 class TestFlopCounter(unittest.TestCase):
   def setUp(self):
-    self.buf0 = TestBuffer(shape=(4,), dtype=dtypes.float32)
-    self.buf1 = TestBuffer(shape=(4,), dtype=dtypes.float32)
+    self.buf0 = LazyOp(LoadOps.BUFFER, (), MemBuffer(1, dtypes.float32, (View.create((4,)),)))
+    self.buf1 = LazyOp(LoadOps.BUFFER, (), MemBuffer(2, dtypes.float32, (View.create((4,)),)))
 
   def test_flops_add(self):
     op0 = LazyOp(BinaryOps.ADD, (self.buf0,self.buf1,), None)
diff --git a/tinygrad/codegen/kernel.py b/tinygrad/codegen/kernel.py
index db6114118e..7b83e095d9 100644
--- a/tinygrad/codegen/kernel.py
+++ b/tinygrad/codegen/kernel.py
@@ -1,12 +1,10 @@
 from typing import NamedTuple, Optional, List, Tuple, cast, Dict
 import itertools
-from tinygrad.ops import LazyOp, MovementOps, FlopCounter, get_lazyop_info, ReduceOps
-from tinygrad.lazy import LazyBuffer
-from tinygrad.helpers import dedup, dtypes, colored, ImageDType, DType
-from tinygrad.runtime.lib import buf_is_kernel_arg
+from tinygrad.ops import LazyOp, FlopCounter, get_lazyop_info, ReduceOps, LoadOps, MemBuffer
+from tinygrad.helpers import dedup, dtypes, colored, ImageDType, DType, all_int
 from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.symbolic import sint
-from tinygrad.shape.view import strides_for_shape
+from tinygrad.shape.view import strides_for_shape, View
 
 class LocalBuffer(NamedTuple):
   name: str
@@ -16,6 +14,7 @@ class LocalBuffer(NamedTuple):
   def __str__(self): return f"localbuffer<{self.name}[{self.size}]>"
 
 class LinearizerOptions(NamedTuple):
+  device: str = ""
   # TODO: make this generic with a list of supported types
   supports_float4: bool = True
   supports_float4_alu: bool = True
@@ -26,41 +25,32 @@ class LinearizerOptions(NamedTuple):
   local_max: Optional[List[int]] = None
 
 class Kernel:
-  def __init__(self, ast:LazyOp, output_buffer:LazyBuffer, opts:Optional[LinearizerOptions]=None):
-    # NOTE: if there's a RESHAPE, we skip it. the output shape is set from the reduce op or a latebuf
-    self.ast = ast.src[0] if ast.op == MovementOps.RESHAPE else ast
+  def __init__(self, ast:LazyOp, opts:Optional[LinearizerOptions]=None, var_vals=None):
     self.opts = opts if opts else LinearizerOptions()
-
-    # get the output buffers
-    self.bufs = [output_buffer] + dedup(ast.buffers)
-    self.arg_bufs = {x:f"data{i}" for i,x in enumerate(dedup([x.realized for x in self.bufs if buf_is_kernel_arg(x)]))}
-
-    # key for lookup in cache (can change, str might not be right)
-    # bufs are needed because kernels like f(x) = x + x and f(x, y) = x + y have the same str(ast), but are different kernels.
-    # mapping the buffers to integers is required because a-b != b-a (and how would you tell a and b apart?)
-    self.key = (ast.map_buffers({x:self.arg_bufs.get(x.realized,x) for x in self.bufs}).key, tuple([x.key for x in self.bufs]))
+    self.ast = ast
+    self.var_vals = var_vals
+    self.key = (ast, tuple(var_vals.keys())) if var_vals else ast
 
   def process(self) -> None:
     if hasattr(self, "sts"): return   # already processed
 
     # fetch lazyop info
     self.info: FlopCounter = get_lazyop_info(cast(LazyOp, self.ast))
-    self.mem_estimate: int = sum(x.dtype.itemsize*x.size for x in self.arg_bufs.keys())
 
     # there's only allowed to be one reduceop
     reduceops = [x for x in self.ast.get_lazyops() if x.op in ReduceOps]
     assert len(dedup(reduceops)) <= 1, "max one reduce op in an ast"
     self.reduceop = reduceops[0] if reduceops else None
 
-    # get earlybufs, before the one reduce op
-    self.earlybufs = dedup(self.reduceop.buffers) if self.reduceop else []
-
     # create new shapetrackers inside this kernel, we will permute them
-    self.sts: List[ShapeTracker] = [x.st.copy() for x in self.bufs]
+    self.bufs = [MemBuffer(0, self.info.dtype, (View.create(self.info.shape),))] + [x.arg for x in self.ast.get_lazyops() if x.op in LoadOps]
+    self.sts: List[ShapeTracker] = [ShapeTracker(x.views[-1].shape, views=list(x.views)) for x in self.bufs]
     for st in self.sts: st.simplify()
 
-    # make the output buffer shape correct in here
-    self.sts[0].reshape(self.info.shape)
+    self.mem_estimate: int = sum(x.dtype.itemsize*x.views[-1].size() for x in self.bufs)
+
+    # get earlybufs, before the one reduce op
+    self.earlybufs = [x.arg for x in self.reduceop.get_lazyops() if x.op in LoadOps] if self.reduceop else []
     self.full_buf_index: int = self.bufs.index(self.earlybufs[0]) if self.earlybufs else 0
 
     # parameters
@@ -77,7 +67,7 @@ class Kernel:
 
   def has_variable_shape(self) -> bool:
     for b in self.bufs:
-      if any(not isinstance(x, int) for x in b.st.shape): return True
+      if not all_int(b.views[-1].shape): return True
     return False
 
   def shape_offsets(self, i): return itertools.product(*[list(range(s)) for s in self.sts[i].shape[self.shape_len-self.upcasted:][::-1]]) if self.upcasted > 0 else [tuple()]
@@ -147,6 +137,6 @@ class Kernel:
   def colored_shape(self) -> str: return ' '.join(colored(s, color) for s,color in zip([f"{s:4d}" if isinstance(s, int) else s for s in self.full_shape], self.colors()))
   def printbufs(self, prefix=""):
     for i,st in enumerate(self.sts):
-      print(prefix, f"{i:3d} {str(self.bufs[i].realized) if self.bufs[i].realized is not None else str(self.bufs[i]):47s}", st.views)
+      print(prefix, f"{i:3d} {str(self.bufs[i]):47s}", st.views)
     print(self.colored_shape())
 
diff --git a/tinygrad/codegen/linearizer.py b/tinygrad/codegen/linearizer.py
index 4ff10f7675..cd47c586f6 100644
--- a/tinygrad/codegen/linearizer.py
+++ b/tinygrad/codegen/linearizer.py
@@ -5,9 +5,8 @@ from collections import defaultdict
 from enum import Enum, auto
 
 from tinygrad.helpers import colored, ImageDType, DEBUG, dtypes, DType, prod, PtrDType, all_same
-from tinygrad.ops import LazyOp, UnaryOps
+from tinygrad.ops import LazyOp, UnaryOps, LoadOps, ConstBuffer, MemBuffer
 from tinygrad.ops import ReduceOps, BinaryOps, TernaryOps
-from tinygrad.runtime.lib import RawConst
 from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.symbolic import Variable, NumNode, Node, SumNode, MulNode, DivNode, ModNode, LtNode, AndNode, sym_rename
 from tinygrad.codegen.optimizer import OptimizedKernel
@@ -128,11 +127,6 @@ def get_grouped_dims(prefix, start_dim, local_dims, maxdim:int=0):
   return local_idxs, [x for x in loop_local_idxs if not isinstance(x, NumNode)]
 
 class Linearizer(OptimizedKernel):
-  def get_buffer_name(self, i):
-    if self.bufs[i].__class__ == LocalBuffer: return self.bufs[i].name
-    assert self.bufs[i].realized.__class__ is not RawConst  # constants shouldn't be loaded with memops
-    return self.arg_bufs[self.bufs[i].realized]
-
   def uop_alu_idx(self, a:UOp, b, ops, ctx:Linearizer, op, dtype=dtypes.int32):
     render_b:UOp = cast(UOp, (NumNode(b) if not isinstance(b, Node) else b).render(ops, ctx))
     return self.uop(UOps.ALU, dtype, (a, render_b), op)
@@ -147,7 +141,7 @@ class Linearizer(OptimizedKernel):
     AndNode: lambda self,ops,ctx: functools.reduce(lambda a,b: ctx.uop_alu_idx(a, b, ops, ctx, BinaryOps.MUL, dtype=dtypes.bool), self.nodes[1:], self.nodes[0].render(ops,ctx)) }
 
   def global_load(self, i:int, idxs:Sequence[VariableOrNum], acc=None) -> List[UOp]:
-    const = self.bufs[i].realized._buf if isinstance(self.bufs[i].realized, RawConst) else acc
+    const = self.bufs[i].val if isinstance(self.bufs[i], ConstBuffer) else acc
 
     expanded_nodes = [idx.expand() for idx in idxs]
     _idxs = [x[::-1] for x in itertools.product(*expanded_nodes[::-1])]
@@ -176,7 +170,7 @@ class Linearizer(OptimizedKernel):
         idx, valid = g_idx.substitute(substitute), g_valid.substitute(substitute)
         localtype = dtypes.float32
       this_const, idx, valid = (invalid_value, Variable.num(0), Variable.num(1)) if valid.max == 0 else (const, idx, valid)
-      key = f"{acc}{localtype}{this_const if this_const is not None and acc is None else self.get_buffer_name(i)}{idx.render()}{valid.render()}"
+      key = f"{acc}{localtype}{this_const if this_const is not None and acc is None else (self.bufs[i].idx if isinstance(self.bufs[i], MemBuffer) else self.bufs[i].name)}{idx.render()}{valid.render()}"
       if key not in self.load_cache:
         if acc is not None:
           assert valid.min == 1
@@ -253,15 +247,13 @@ class Linearizer(OptimizedKernel):
     self.loop_uops: Dict[str, UOp] = {}
 
     # add global buffers
-    arg_bufs = {}
-    for buf,name in self.arg_bufs.items():
-      arg_bufs[buf] = self.uop(UOps.DEFINE_GLOBAL, PtrDType(buf.dtype) if not isinstance(buf.dtype, ImageDType) else buf.dtype, (), (name, buf.dtype))
-    for i,b in enumerate(self.bufs):
-      if b.realized in arg_bufs: self.buf_uops[i] = arg_bufs[b.realized]
-    # add variables from symbolic shapes
-    for var in sorted(set(v for buf in self.ast.buffers for v in buf.var_vals), key=lambda k: k.key):
-      assert var.expr is not None
-      self.loop_uops[var.expr] = self.uop(UOps.DEFINE_GLOBAL, dtypes.int32, (), (var.expr, dtypes._arg_int32))
+    for i,buf in enumerate(self.bufs):
+      if isinstance(buf, MemBuffer):
+        self.buf_uops[i] = self.uop(UOps.DEFINE_GLOBAL, PtrDType(buf.dtype) if not isinstance(buf.dtype, ImageDType) else buf.dtype, (), (f"data{buf.idx}", buf.dtype))
+    if self.var_vals:
+      for var in sorted(set(self.var_vals), key=lambda k: k.key):
+        assert var.expr is not None
+        self.loop_uops[var.expr] = self.uop(UOps.DEFINE_GLOBAL, dtypes.int32, (), (var.expr, dtypes._arg_int32))
     # define local buffers
     for lb in self.local_alias.values():
       self.buf_uops[self.bufs.index(lb)] = self.uop(UOps.DEFINE_LOCAL, PtrDType(dtypes.float32), (), (lb.name, self.sts[self.bufs.index(lb)].size()))
@@ -491,7 +483,8 @@ class Linearizer(OptimizedKernel):
     return self.uops[-1]
 
   def ast_parse(self, x, acc, loaded_buffers, do_reduce=False) -> List[UOp]:
-    if x.__class__ is not LazyOp: return loaded_buffers[x]
+    if x.__class__ is not LazyOp: return loaded_buffers[x]    # for LOCAL_BUFFER
+    if x.op in [LoadOps.BUFFER, LoadOps.CONST]: return loaded_buffers[x.arg]
     if x.op in [UnaryOps.NOOP, UnaryOps.CAST]: return self.ast_parse(x.src[0], acc, loaded_buffers)  # cast isn't an ALU op
     if x.op in ReduceOps and not do_reduce: return acc
     # MULACC fusion. TODO: this is copied from Interpreted
diff --git a/tinygrad/codegen/optimizer.py b/tinygrad/codegen/optimizer.py
index 8925df077e..bf939bc2b0 100644
--- a/tinygrad/codegen/optimizer.py
+++ b/tinygrad/codegen/optimizer.py
@@ -1,9 +1,8 @@
 from typing import Tuple, List, cast, Optional
 import itertools, math, os
 from tinygrad.helpers import DEBUG, prod, getenv, ImageDType, dtypes
-from tinygrad.ops import ReduceOps, BinaryOps, UnaryOps, LazyOp
+from tinygrad.ops import ReduceOps, BinaryOps, UnaryOps, LazyOp, LoadOps
 from tinygrad.codegen.kernel import Kernel, LocalBuffer
-from tinygrad.lazy import LazyBuffer
 from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.view import View
 
@@ -167,11 +166,11 @@ class OptimizedKernel(Kernel):
     self.simplify_ones()
 
     # should use HIP tensor cores?
-    if getenv("TC", 1) != 0 and self.bufs[0].device == "HIP" and self.reduceop and self.reduceop.op == ReduceOps.SUM and \
+    if getenv("TC", 1) != 0 and self.opts.device == "HIP" and self.reduceop and self.reduceop.op == ReduceOps.SUM and \
         isinstance(self.reduceop.src[0], LazyOp) and self.reduceop.src[0].op == UnaryOps.CAST and \
         isinstance(self.reduceop.src[0].src[0], LazyOp) and self.reduceop.src[0].src[0].op == BinaryOps.MUL and \
-        isinstance(self.reduceop.src[0].src[0].src[0], LazyBuffer) and isinstance(self.reduceop.src[0].src[0].src[1], LazyBuffer) and self.opts.has_local and \
-        self.reduceop.src[0].src[0].src[0].dtype == dtypes.half and self.reduceop.src[0].src[0].src[1].dtype == dtypes.half:
+        self.reduceop.src[0].src[0].src[0].op == LoadOps.BUFFER and self.reduceop.src[0].src[0].src[1].op == LoadOps.BUFFER and self.opts.has_local and \
+        cast(LazyOp, self.reduceop.src[0].src[0].src[0]).arg.dtype == dtypes.half and cast(LazyOp, self.reduceop.src[0].src[0].src[1]).arg.dtype == dtypes.half:
       # HIP tensor cores are 16x16x16
       buf0 = self.bufs.index(self.reduceop.src[0].src[0].src[0])
       buf1 = self.bufs.index(self.reduceop.src[0].src[0].src[1])
@@ -227,10 +226,10 @@ class OptimizedKernel(Kernel):
 
     # should use METAL tensor cores?
     # first, confirm it's a straightforward mulacc on a device with real locals
-    tensor_cores_allowed = getenv("TC", 1) != 0 and (getenv("TC", 1) == 2 or (self.bufs[0].device == "METAL" and os.uname().machine == "arm64"))
+    tensor_cores_allowed = getenv("TC", 1) != 0 and (getenv("TC", 1) == 2 or (self.opts.device == "METAL" and os.uname().machine == "arm64"))
     if tensor_cores_allowed and self.reduceop and self.reduceop.op == ReduceOps.SUM and \
         isinstance(self.reduceop.src[0], LazyOp) and self.reduceop.src[0].op == BinaryOps.MUL and \
-        isinstance(self.reduceop.src[0].src[0], LazyBuffer) and isinstance(self.reduceop.src[0].src[1], LazyBuffer) and self.opts.has_local:
+        self.reduceop.src[0].src[0].op == LoadOps.BUFFER and self.reduceop.src[0].src[1] == LoadOps.BUFFER and self.opts.has_local:
       # METAL tensor cores are 8x8x8, with 2 elements per thread in the 32 thread warp
       buf0 = self.bufs.index(self.reduceop.src[0].src[0])
       buf1 = self.bufs.index(self.reduceop.src[0].src[1])
diff --git a/tinygrad/codegen/search.py b/tinygrad/codegen/search.py
index 9f6299bafe..1bdeda5029 100644
--- a/tinygrad/codegen/search.py
+++ b/tinygrad/codegen/search.py
@@ -21,7 +21,7 @@ def kernel_optimize_opts(k:Linearizer):
     opts.append(ng.p.TransitionChoice([(i,s,"G") for s in get_divisors(k.full_shape[k.first_reduce+i], min_div=4) if all(st.shape[k.first_reduce+i] % s == 0 or st.shape[k.first_reduce+i] == 1 for st in k.sts)]))
   return opts
 
-def kernel_optimize_search(k:Linearizer, create_k:Callable[[], Linearizer], to_prg, baseline):
+def kernel_optimize_search(k:Linearizer, create_k:Callable[[], Linearizer], to_prg, baseline, bufs):
   import nevergrad as ng
   def opt(x):
     try:
@@ -29,9 +29,9 @@ def kernel_optimize_search(k:Linearizer, create_k:Callable[[], Linearizer], to_p
       k.process()
       k.apply_auto_opt(x)
       prg = to_prg(k)
-      first_tm = prg.exec(k.bufs, force_wait=True, optimizing=True)
+      first_tm = prg.exec(bufs, force_wait=True, optimizing=True)
       if baseline*5 < first_tm*1000: return first_tm*1000  # very slow
-      tm = min([first_tm]+[prg.exec(k.bufs, force_wait=True, optimizing=True) for _ in range(2)])*1000
+      tm = min([first_tm]+[prg.exec(bufs, force_wait=True, optimizing=True) for _ in range(2)])*1000
       return tm
     except Exception:
       if DEBUG >= 3:
@@ -51,7 +51,7 @@ def kernel_optimize_search(k:Linearizer, create_k:Callable[[], Linearizer], to_p
 
 # optimization
 global_db = None
-def kernel_optimize(k:Linearizer, create_k:Callable[[], Linearizer], to_prg):
+def kernel_optimize(k:Linearizer, create_k:Callable[[], Linearizer], to_prg, bufs):
   global global_db
 
   k.process()
@@ -72,8 +72,8 @@ def kernel_optimize(k:Linearizer, create_k:Callable[[], Linearizer], to_prg):
       k = create_k()
       k.hand_coded_optimizations()
       prg = to_prg(k)
-      return min([prg.exec(k.bufs, force_wait=True, optimizing=True) for _ in range(5)])*1000
-    choice = kernel_optimize_search(k, create_k, to_prg, get_baseline())
+      return min([prg.exec(bufs, force_wait=True, optimizing=True) for _ in range(5)])*1000
+    choice = kernel_optimize_search(k, create_k, to_prg, get_baseline(), bufs)
     if global_db is not None:
       global_db[skey] = choice
       global_db.sync()
diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py
index f9d511ef57..24d9687070 100644
--- a/tinygrad/lazy.py
+++ b/tinygrad/lazy.py
@@ -1,16 +1,16 @@
 from __future__ import annotations
 import sys, operator, math
-from typing import Callable, Optional, Tuple, Union, List, Dict, Any, cast
+from typing import Callable, Optional, Tuple, Union, List, Dict, Any, cast, Mapping
 from weakref import ref, WeakSet, WeakValueDictionary
 
 import numpy as np
 from tinygrad.graph import log_op
-from tinygrad.helpers import GRAPH, DEBUG, prod, getenv, DType, dtypes, flatten, ImageDType, partition, all_int
-from tinygrad.ops import Device, Compiled, UnaryOps, BinaryOps, TernaryOps, ReduceOps, MovementOps, LoadOps, OpType, LazyOp
+from tinygrad.helpers import GRAPH, DEBUG, prod, getenv, DType, dtypes, flatten, ImageDType, partition, all_int, dedup, merge_dicts
+from tinygrad.ops import Device, Compiled, UnaryOps, BinaryOps, TernaryOps, ReduceOps, MovementOps, LoadOps, OpType, LazyOp, MemBuffer, ConstBuffer
 from tinygrad.shape.shapetracker import ShapeTracker, View, get_contraction
 from tinygrad.shape.symbolic import Variable, sint
 
-from tinygrad.runtime.lib import RawConst, RawBuffer, RawBufferMapped, RawBufferTransfer
+from tinygrad.runtime.lib import RawConst, RawBuffer, RawBufferMapped, RawBufferTransfer, buf_is_kernel_arg
 from tinygrad.runtime.ops_cpu import RawNumpyBuffer
 from tinygrad.runtime.ops_disk import RawDiskBuffer
 
@@ -90,6 +90,20 @@ def _ast_binaryops(self:LazyBuffer) -> LazyOp:
   ast = self.op.map_buffers(cast(Dict[LazyBuffer, Union[LazyOp, LazyBuffer, str]], real_srcs))
   return LazyOp(MovementOps.RESHAPE, (ast, ), self.shape) if intermediate_shape != self.shape else ast
 
+def _replace_loadops(op:LazyOp) -> Tuple[LazyOp, List[LazyBuffer]]:
+  replacements:Dict[LazyBuffer, LazyOp] = {}
+  realized_bufs = dedup([x.realized for x in op.buffers if buf_is_kernel_arg(x)])
+  for x in op.buffers:
+    assert x.realized, "buffer isn't realized"
+    x.st.simplify()
+    if isinstance(x.realized, RawConst):
+      replacements[x] = LazyOp(LoadOps.CONST, (), ConstBuffer(x.realized._buf, x.realized.dtype, tuple(x.st.views)))
+    elif x.realized in realized_bufs:
+      replacements[x] = LazyOp(LoadOps.BUFFER, (), MemBuffer(realized_bufs.index(x.realized)+1, x.realized.dtype, tuple(x.st.views)))
+    else:
+      raise NotImplementedError(f"not handled {x}")
+  return (op.src[0] if op.op == MovementOps.RESHAPE else op).map_buffers(replacements), realized_bufs
+
 # **** lazy operations ****
 
 def get_single_root(root:LazyBuffer) -> LazyBuffer: return get_single_root(cast(LazyBuffer, root.op.src[0])) if getattr(root, 'op', None) and len(root.op.src) == 1 else root
@@ -162,7 +176,9 @@ class LazyBuffer:
           else:
             self.op = LazyOp(UnaryOps.CAST, (self.op,), (dtypes.float32, False))
           self.dtype = dtypes.float32
-        self.realized = Device[self.device].exec_ast(self.op, output=self, **self._device_extra_args())
+        self.var_vals = dict(sorted(merge_dicts([buf.var_vals for buf in self.op.buffers]).items(), key=lambda kv:cast(Variable,kv[0]).key))
+        op, realized_bufs = _replace_loadops(self.op)
+        self.realized = Device[self.device].exec_ast(op, output=self, inputs=realized_bufs, var_vals=self.var_vals, **self._device_extra_args())
 
       assert self.realized and isinstance(self.realized, (RawConst, Device[self.device].buffer)), f"device mismatch on realized got {type(self.realized)} expected {self.device}"
       # HACK: allow hot casting of images
@@ -315,7 +331,7 @@ class LazyBuffer:
 
   @property
   def buffers(self) -> Tuple[LazyBuffer, ...]: return (self,)
-  def map_buffers(self, real_srcs: Dict[LazyBuffer, Union[LazyBuffer, LazyOp, str]]): return real_srcs.get(self, self)
+  def map_buffers(self, real_srcs: Mapping[LazyBuffer, Union[LazyBuffer, LazyOp, str]]): return real_srcs.get(self, self)
   def get_lazyops(self) -> List[LazyOp]: return []
   def replace_with_movement_ops(self: LazyBuffer, ops:List[Tuple[MovementOps, Any]]) -> LazyBuffer:
     y = self
diff --git a/tinygrad/ops.py b/tinygrad/ops.py
index 99dab39211..e4039c77a2 100644
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -1,8 +1,10 @@
 from __future__ import annotations
 import time, importlib, inspect, functools, pathlib
 from enum import Enum, auto
-from typing import TYPE_CHECKING, Union, Type, Tuple, Any, List, Optional, Dict, Callable, cast
-from tinygrad.helpers import ansilen, prod, DEBUG, getenv, GlobalCounters, DType, colored, dedup, merge_dicts
+from typing import TYPE_CHECKING, Union, Type, Tuple, Any, List, Optional, Dict, Callable, cast, Mapping
+from tinygrad.helpers import ansilen, prod, DEBUG, getenv, GlobalCounters, DType, colored
+from tinygrad.shape.view import View
+from dataclasses import dataclass
 if TYPE_CHECKING: from tinygrad.lazy import LazyBuffer
 
 # these are the llops your accelerator must implement, along with toCpu
@@ -14,11 +16,23 @@ class BinaryOps(Enum): ADD = auto(); SUB = auto(); MUL = auto(); DIV = auto(); M
 class ReduceOps(Enum): SUM = auto(); MAX = auto() # noqa: E702
 class TernaryOps(Enum): MULACC = auto(); WHERE = auto() # noqa: E702
 class MovementOps(Enum): RESHAPE = auto(); PERMUTE = auto(); EXPAND = auto(); PAD = auto(); SHRINK = auto(); STRIDE = auto(); AS_STRIDED = auto() # noqa: E702
-class LoadOps(Enum): EMPTY = auto(); RAND = auto(); CONST = auto(); FROM = auto(); CONTIGUOUS = auto(); CUSTOM = auto() # noqa: E702
+class LoadOps(Enum): EMPTY = auto(); RAND = auto(); CONST = auto(); FROM = auto(); CONTIGUOUS = auto(); CUSTOM = auto(); BUFFER = auto() # noqa: E702
 
 Op = Union[UnaryOps, BinaryOps, ReduceOps, MovementOps, LoadOps, TernaryOps]
 OpType = Union[Type[UnaryOps], Type[BinaryOps], Type[ReduceOps], Type[MovementOps], Type[LoadOps], Type[TernaryOps]]
 
+@dataclass(frozen=True)
+class MemBuffer:
+  idx: int
+  dtype: DType
+  views: Tuple[View, ...]
+
+@dataclass(frozen=True)
+class ConstBuffer:
+  val: Any
+  dtype: DType
+  views: Tuple[View, ...]
+
 class LazyOp:
   __slots__ = "op", "src", "arg", "buffers", "__weakref__"
   op: Op
@@ -37,7 +51,7 @@ class LazyOp:
   @property
   def key(self): return (self.op, tuple(map(lambda x: getattr(x, "key", x), self.src)), getattr(self.arg, "key", self.arg))
 
-  def map_buffers(self, real_srcs: Dict[LazyBuffer, Union[LazyBuffer, LazyOp, str]]) -> LazyOp: return LazyOp(self.op, tuple([y.map_buffers(real_srcs) for y in self.src]), self.arg)
+  def map_buffers(self, real_srcs: Mapping[LazyBuffer, Union[LazyBuffer, LazyOp, str]]) -> LazyOp: return LazyOp(self.op, tuple([y.map_buffers(real_srcs) for y in self.src]), self.arg)
   def get_lazyops(self) -> List[LazyOp]: return [self] + [item for x in self.src for item in x.get_lazyops()]
 
   def replace_with_movement_ops(self:LazyOp, ops:List[Tuple[MovementOps, Tuple[Any, ...]]]) -> 'LazyBuffer':
@@ -87,9 +101,8 @@ Device = _Device()
 
 # **************** for Interpreted Buffers ****************
 
-def apply_shapetracker(fxn_for_op, ret, st):
-  st.simplify()   # TODO: this is generic for Compiled too
-  for v in st.views:
+def apply_shapetracker(fxn_for_op, ret, views):
+  for v in views:
     real_shape = tuple(y-x for x,y in v.mask) if v.mask else v.shape
     real_offset = v.offset + (sum(x*st for (x,_),st in zip(v.mask, v.strides)) if v.mask else 0)
     # first, we apply the offset
@@ -111,20 +124,22 @@ def apply_shapetracker(fxn_for_op, ret, st):
   return ret
 
 class Interpreted:
-  def __init__(self, buffer, fxn_for_op: Dict[Op, Callable], from_lazybuffer=None, to_underlying=lambda x: x._buf, from_underlying=None):
+  def __init__(self, buffer, fxn_for_op: Dict[Op, Callable], to_underlying=lambda x: x._buf, from_underlying=None):
     self.buffer, self.fxn_for_op, self.to_underlying = buffer, fxn_for_op, to_underlying
     self.from_underlying = buffer if from_underlying is None else from_underlying
-    self.from_lazybuffer = from_lazybuffer if from_lazybuffer is not None else lambda x: self.from_underlying(apply_shapetracker(self.fxn_for_op, self.to_underlying(x.realized), x.st))
     self.synchronize = lambda: None
     self.codegen = None
 
-  def exec_ast(self, ast:LazyOp, output=None, context=None, **kwargs):
+  def exec_ast(self, ast:LazyOp, output=None, inputs=None, var_vals=None, context=None, **kwargs):
+    if ast.op == LoadOps.BUFFER and LoadOps.BUFFER not in self.fxn_for_op:
+      assert inputs[ast.arg.idx-1].dtype == ast.arg.dtype, "dtype mismatch"
+      return self.from_underlying(apply_shapetracker(self.fxn_for_op, self.to_underlying(inputs[ast.arg.idx-1]), ast.arg.views))
     if TernaryOps.MULACC in self.fxn_for_op and ast.op == ReduceOps.SUM and isinstance(ast.src[0], LazyOp) and ast.src[0].op == BinaryOps.MUL:
       ast = LazyOp(TernaryOps.MULACC, ast.src[0].src, ast.arg)
     created_context = context is None
     if context is None: context = dict()
     if not created_context and ast in context: return context[ast]
-    srcs = [self.exec_ast(x, context=context, **kwargs) if isinstance(x, LazyOp) else self.from_lazybuffer(x) for x in ast.src]
+    srcs = [self.exec_ast(cast(LazyOp, x), inputs=inputs, context=context, **kwargs) for x in ast.src]
     if DEBUG >= 3: st = time.perf_counter()
     ret = self.from_underlying(self.fxn_for_op[ast.op](*([self.to_underlying(x) for x in srcs] + ([ast.arg] if ast.arg is not None else []))))
     if output is not None and ret.dtype != output.dtype and UnaryOps.CAST in self.fxn_for_op: ret = self.from_underlying(self.fxn_for_op[UnaryOps.CAST](self.to_underlying(ret), (output.dtype, False))) # Do manual casting of ret if it does not match the required output dtype.
@@ -147,17 +162,18 @@ class FlopCounter:
     self.flops, ret = 0, self.flops
     return ret
 shape_fxn_for_op: Dict[Op, Callable] = {
+  LoadOps.BUFFER: lambda arg: (arg.views[-1].shape, arg.dtype, 0), LoadOps.CONST: lambda arg: (arg.views[-1].shape, arg.dtype, 0),
   UnaryOps.CAST: lambda self,arg: (self.shape, arg[0], self.consume_flops()),   # cast uses no flops
   **{op:lambda self: (self.shape, self.dtype, self.consume_flops() + prod(self.shape)) for op in UnaryOps if op != UnaryOps.CAST},
   **{op:lambda self,y: (self.shape, max(self.dtype, y.dtype), self.consume_flops() + y.consume_flops() + prod(self.shape)) for op in BinaryOps},
   **{op:lambda self,new_shape: (new_shape, self.dtype, self.consume_flops() + prod(self.shape)) for op in ReduceOps},
   TernaryOps.WHERE: lambda self,y,z: (self.shape, self.dtype, self.consume_flops() + y.consume_flops() + z.consume_flops() + prod(self.shape))}
-InterpretedFlopCounter = Interpreted(FlopCounter, shape_fxn_for_op, lambda x: FlopCounter((x.shape, x.dtype, 0)), lambda x: x)
+InterpretedFlopCounter = Interpreted(FlopCounter, shape_fxn_for_op, lambda x: x)
 def get_lazyop_info(ast:LazyOp) -> FlopCounter: return InterpretedFlopCounter.exec_ast(ast)
 
 # **************** for Compiled Buffers ****************
 
-from tinygrad.runtime.lib import RawBuffer, RawConst, buf_is_kernel_arg
+from tinygrad.runtime.lib import RawBuffer, RawConst
 from tinygrad.shape.symbolic import Variable, sym_infer
 
 class ASTRunner:
@@ -169,9 +185,8 @@ class ASTRunner:
     self.clprg = runtime(self.name, self.prg, **self.runtime_args)
     return self
 
-  def exec(self, bufs, var_vals:Optional[Dict[Variable, int]]=None, force_wait=False, optimizing=False) -> Optional[float]:
+  def exec(self, rawbufs, var_vals:Optional[Dict[Variable, int]]=None, force_wait=False, optimizing=False) -> Optional[float]:
     from tinygrad.jit import CacheCollector
-    rawbufs = dedup([x.realized for x in bufs if buf_is_kernel_arg(x)])
     if not optimizing: CacheCollector.add(self, rawbufs, var_vals if var_vals is not None else {})
     return self(rawbufs, var_vals, force_wait=force_wait)
 
@@ -205,33 +220,38 @@ class Compiled:
                      op_estimate=k.info.flops, mem_estimate=k.mem_estimate,
                      display_name=k.display_name, runtime_args={"binary": False}).build(self.runtime)
 
-  def exec_ast(self, ast:LazyOp, output, **kwargs):
+  def exec_ast(self, ast:LazyOp, output, inputs, var_vals, **kwargs):
+    #if DEBUG >= 4:
+    #  from extra.utils import print_tree
+    #  print_tree(ast)
+
     # check if we can reuse the output buffer
     # if it's aliased, don't use it
     # NOTE: this is pretty wrong actually, who knows where else this buffer is used?
     output.realized = output.output_buffer
     if output.realized:
       if output.realized.__class__ is RawConst: output.realized = None  # can't assign to RawConst
-      for a in ast.buffers:
-        if a.realized == output.realized and not a.st.contiguous:
-          output.realized = None
-          break
+      for i,a in enumerate(inputs):
+        # TODO: if this is contiguous it's fine
+        if a == output.realized:
+          views = [x.arg.views for x in ast.get_lazyops() if x.op == LoadOps.BUFFER and x.arg.idx == i+1]
+          if any(len(v) > 1 or not v[0].contiguous for v in views):
+            output.realized = None
+            break
 
     # we don't have an output buffer, we have to create it, and create to max size if it has symbolic shape
     if not output.realized: output.realized = self.buffer(prod((s if isinstance(s, int) else s.max for s in output.shape)), output.dtype, **kwargs)
     else:
       from tinygrad.jit import CacheCollector
       CacheCollector._mark_output_buffer(output.output_buffer)
-    # update the output var_vals from src
-    output.var_vals = dict(sorted(merge_dicts([buf.var_vals for buf in ast.buffers]).items(), key=lambda kv:cast(Variable,kv[0]).key))
 
     from tinygrad.codegen.linearizer import Linearizer
-    k = Linearizer(ast, output, self.linearizer_opts)
+    k = Linearizer(ast, self.linearizer_opts, var_vals)
 
     # compilation time
     def get_program():
       from tinygrad.codegen.search import kernel_optimize
-      if getenv("KOPT"): kernel_optimize(k, lambda: Linearizer(ast, output, self.linearizer_opts), self.to_program)
+      if getenv("KOPT"): kernel_optimize(k, lambda: Linearizer(ast, self.linearizer_opts, var_vals), self.to_program, [output.realized]+inputs)
       elif not getenv("NOOPT"): k.hand_coded_optimizations()
       return self.to_program(k)
 
@@ -243,5 +263,5 @@ class Compiled:
 
     if prg.name == getenv("PRINT_PRG", ''): print(prg.prg)
 
-    prg.exec(k.bufs, var_vals=output.var_vals)
+    prg.exec([output.realized]+inputs, var_vals=var_vals)
     return output.realized
diff --git a/tinygrad/runtime/lib.py b/tinygrad/runtime/lib.py
index 233c4a7856..88bf32b212 100644
--- a/tinygrad/runtime/lib.py
+++ b/tinygrad/runtime/lib.py
@@ -17,7 +17,7 @@ class RawBuffer:  # pylint: disable=abstract-method
   def __del__(self):  # NOTE: if it fails on init (bad dtype), it won't have a _memsz
     if hasattr(self, '_memsz'): GlobalCounters.mem_used -= self._memsz
     if hasattr(self, '_allocator') and self._allocator: self._allocator.free(self._buf)
-  def __repr__(self): return f"buffer<{self.size}, {self.dtype}>"
+  def __repr__(self): return f"buffer<{self.size}, {self.dtype}, {id(self)}>"
   @property
   def key(self): return (self.size, self.dtype)
 
diff --git a/tinygrad/runtime/ops_hip.py b/tinygrad/runtime/ops_hip.py
index 9d02a86974..9382df90a2 100644
--- a/tinygrad/runtime/ops_hip.py
+++ b/tinygrad/runtime/ops_hip.py
@@ -93,4 +93,4 @@ __device__ void vstore_half4(float4 data, size_t offset, half *p) { *(p + offset
   """,
   gid = [f'blockIdx.{chr(120+i)}' for i in range(3)],
   lid = [f'threadIdx.{chr(120+i)}' for i in range(3)]))
-HIPBuffer = Compiled(RawHIPBuffer, LinearizerOptions(), renderer, HIPProgram, hip.hipDeviceSynchronize)
+HIPBuffer = Compiled(RawHIPBuffer, LinearizerOptions(device="HIP"), renderer, HIPProgram, hip.hipDeviceSynchronize)
diff --git a/tinygrad/runtime/ops_metal.py b/tinygrad/runtime/ops_metal.py
index efc9e56d9f..86a0282e12 100644
--- a/tinygrad/runtime/ops_metal.py
+++ b/tinygrad/runtime/ops_metal.py
@@ -85,4 +85,4 @@ renderer = functools.partial(uops_to_cstyle, CStyleLanguage(
   barrier = "threadgroup_barrier(mem_flags::mem_threadgroup);", float4 = "float4", uses_ptr_arithmetic=True,
   gid = [f"gid.{chr(120+i)}" for i in range(3)], lid = [f"lid.{chr(120+i)}" for i in range(3)],
   extra_args = ['uint3 gid [[threadgroup_position_in_grid]]', 'uint3 lid [[thread_position_in_threadgroup]]']))
-MetalBuffer = Compiled(RawMetalBuffer, LinearizerOptions(), renderer, MetalProgram, METAL.synchronize)
+MetalBuffer = Compiled(RawMetalBuffer, LinearizerOptions(device="METAL"), renderer, MetalProgram, METAL.synchronize)
diff --git a/tinygrad/shape/view.py b/tinygrad/shape/view.py
index c13be65bfa..1e6fb31036 100644
--- a/tinygrad/shape/view.py
+++ b/tinygrad/shape/view.py
@@ -2,7 +2,7 @@ from __future__ import annotations
 import functools
 from typing import Tuple, List, Optional, NamedTuple
 from tinygrad.helpers import prod, all_int
-from tinygrad.shape.symbolic import NumNode, is_sym_int, sint
+from tinygrad.shape.symbolic import Node, NumNode, is_sym_int, sint
 
 @functools.lru_cache(maxsize=None)
 def filter_strides(shape:Tuple[int, ...], strides:Tuple[int, ...]) -> Tuple[int, ...]:
@@ -29,7 +29,7 @@ class View(NamedTuple):
     return View(shape, strides, offset, mask, contiguous)
 
   @functools.lru_cache(maxsize=None)  # pylint: disable=method-cache-max-size-none
-  def size(self): return prod([s for s,st in zip(self.shape, self.strides) if st != 0])
+  def size(self): return prod([s.max if isinstance(s, Node) else s for s,st in zip(self.shape, self.strides) if st != 0])
 
   # MovementOps live here now