diff --git a/test/test_llops.py b/test/test_llops.py
index b7fbb6f6ad..2606ea8bc9 100644
--- a/test/test_llops.py
+++ b/test/test_llops.py
@@ -6,7 +6,7 @@ import numpy as np
 from tinygrad.tensor import Device
 from tinygrad.helpers import UnaryOps, BinaryOps, ReduceOps
 if Device.DEFAULT == Device.GPU:
-  from tinygrad.llops.opencl import GPUBuffer, sync, unary_op, binary_op, reduce_op
+  from tinygrad.llops.ops_gpu import GPUBuffer, sync, unary_op, binary_op, reduce_op
 
 def timeit(fxn, its=1000, done=lambda:None):
   fxn()
diff --git a/tinygrad/llops/ops_cpu.py b/tinygrad/llops/ops_cpu.py
index 28a713b16f..aee0a8a587 100644
--- a/tinygrad/llops/ops_cpu.py
+++ b/tinygrad/llops/ops_cpu.py
@@ -1,7 +1,7 @@
 import numpy as np
 from tinygrad.helpers import UnaryOps, BinaryOps, ReduceOps
 
-class Buffer(np.ndarray):
+class CPUBuffer(np.ndarray):
   def toCPU(x): return x
   def log(x): return np.log(x)
   def exp(x): return np.exp(x)
diff --git a/tinygrad/llops/ops_gpu.py b/tinygrad/llops/ops_gpu.py
index d4c5d3fff0..3be6e38ef5 100644
--- a/tinygrad/llops/ops_gpu.py
+++ b/tinygrad/llops/ops_gpu.py
@@ -17,20 +17,20 @@ i32 = np.int32
 def roundup(x, n=4): return (x+(n-1))//n * n
 def sync(): cl_queue.finish()
 
-class Buffer:
+class GPUBuffer:
   def __init__(self, shape, hostbuf=None):
     require_init_gpu()
     self.shape, self.dtype = tuple(shape), np.float32
-    self.cl = hostbuf.cl if isinstance(hostbuf, Buffer) else cl.Buffer(cl_ctx, cl.mem_flags.READ_WRITE, 4*roundup(np.prod(shape)))  # padding
-    if hostbuf is not None and not isinstance(hostbuf, Buffer):
+    self.cl = hostbuf.cl if isinstance(hostbuf, GPUBuffer) else cl.Buffer(cl_ctx, cl.mem_flags.READ_WRITE, 4*roundup(np.prod(shape)))  # padding
+    if hostbuf is not None and not isinstance(hostbuf, GPUBuffer):
       cl.enqueue_copy(cl_queue, self.cl, hostbuf.astype(np.float32).ravel())
 
   def __repr__(self):
-    return f"<GPU Buffer with shape {self.shape!r}>"
+    return f"<GPUBuffer with shape {self.shape!r}>"
 
   @staticmethod
   def fromCPU(x):
-    return Buffer(x.shape, x.view(np.ndarray))
+    return GPUBuffer(x.shape, x.view(np.ndarray))
 
   def toCPU(self):
     data = np.empty(self.shape, dtype=np.float32)
@@ -145,7 +145,7 @@ def reduce_op(op, inp, ret):
 
 def reshape(x, shape):
   assert np.prod(x.shape) == np.prod(shape)
-  return Buffer(shape, hostbuf=x)
+  return GPUBuffer(shape, hostbuf=x)
 
 def perm_axis(inp, order, ret):
   perm = clbuild("perm", """
diff --git a/tinygrad/llops/ops_torch.py b/tinygrad/llops/ops_torch.py
index fc8810b595..d6e12de07d 100644
--- a/tinygrad/llops/ops_torch.py
+++ b/tinygrad/llops/ops_torch.py
@@ -2,16 +2,16 @@ import torch
 import numpy as np
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-class Buffer(torch.Tensor):
+class TorchBuffer(torch.Tensor):
   def __new__(cls, shape):
     if isinstance(shape, torch.Tensor):
       return super().__new__(cls, shape)
     else:
-      return Buffer(torch.zeros(shape))
+      return TorchBuffer(torch.zeros(shape))
   custompad = lambda x,padding: torch.nn.functional.pad(x, [item for sublist in padding[::-1] for item in sublist])
   @staticmethod
   def fromCPU(data):
-    return Buffer(torch.from_numpy(data).requires_grad_(False)).to(device)
+    return TorchBuffer(torch.from_numpy(data).requires_grad_(False)).to(device)
   def toCPU(x):
     return x.cpu().numpy()
   def getdtype(self):
diff --git a/tinygrad/mlops.py b/tinygrad/mlops.py
index 05c674d873..f735b5660f 100644
--- a/tinygrad/mlops.py
+++ b/tinygrad/mlops.py
@@ -14,18 +14,18 @@ def select_llops(ops):
 class _UnaryOp(Function):
   def forward(ctx, input):
     ctx.save_for_backward(input)
-    return ctx.op.unary_op(ctx.fop, input, ctx.op.Buffer(input.shape))
+    return ctx.op.unary_op(ctx.fop, input, ctx.buffer(input.shape))
 
   def backward(ctx, grad_output):
     input, = ctx.saved_tensors
-    return ctx.op.binary_op(ctx.bop, input, grad_output, ctx.op.Buffer(input.shape))
+    return ctx.op.binary_op(ctx.bop, input, grad_output, ctx.buffer(input.shape))
 
 class ReLU(_UnaryOp):
   fop = UnaryOps.RELU
 
   def backward(ctx, grad_output):
     input, = ctx.saved_tensors
-    ret = ctx.op.Buffer(input.shape)
+    ret = ctx.buffer(input.shape)
     ctx.op.unary_op(UnaryOps.SIGN, input, ret)
     ctx.op.unary_op(UnaryOps.RELU, ret, ret)
     return ctx.op.binary_op(BinaryOps.MUL, ret, grad_output, ret)
@@ -36,7 +36,7 @@ class Log(_UnaryOp):
 
 class Exp(_UnaryOp):
   def forward(ctx, input):
-    ret = ctx.op.unary_op(UnaryOps.EXP, input, ctx.op.Buffer(input.shape))
+    ret = ctx.op.unary_op(UnaryOps.EXP, input, ctx.buffer(input.shape))
     ctx.save_for_backward(ret)   # we save the output here, not the input
     return ret
 
@@ -50,37 +50,37 @@ def reduce_shape(shape, axis):
 class Sum(Function):
   def forward(ctx, input, axis=None):
     ctx.save_for_backward(input.shape)
-    return ctx.op.reduce_op(ReduceOps.SUM, input, ctx.op.Buffer(reduce_shape(input.shape, axis)))
+    return ctx.op.reduce_op(ReduceOps.SUM, input, ctx.buffer(reduce_shape(input.shape, axis)))
 
   def backward(ctx, grad_output):
     shape_input, = ctx.saved_tensors
     # NOTE: the b Buffer isn't used, since this is just for broadcast
-    ret = ctx.op.Buffer(shape_input)
+    ret = ctx.buffer(shape_input)
     return ctx.op.binary_op(BinaryOps.A, grad_output, ret, ret)
 
 class Max(Function):
   def forward(ctx, input, axis=None):
-    ret = ctx.op.reduce_op(ReduceOps.MAX, input, ctx.op.Buffer(reduce_shape(input.shape, axis)))
+    ret = ctx.op.reduce_op(ReduceOps.MAX, input, ctx.buffer(reduce_shape(input.shape, axis)))
     ctx.save_for_backward(input, ret)
     return ret
 
   def backward(ctx, grad_output):
     input, ret = ctx.saved_tensors
-    ret2 = ctx.op.binary_op(BinaryOps.CMPEQ, input, ret, ctx.op.Buffer(input.shape))
-    div = ctx.op.reduce_op(ReduceOps.SUM, ret2, ctx.op.Buffer(grad_output.shape))
+    ret2 = ctx.op.binary_op(BinaryOps.CMPEQ, input, ret, ctx.buffer(input.shape))
+    div = ctx.op.reduce_op(ReduceOps.SUM, ret2, ctx.buffer(grad_output.shape))
     ctx.op.binary_op(BinaryOps.DIV, div, ret2, ret2)
     return ctx.op.binary_op(BinaryOps.MUL, ret2, grad_output, ret2)
 
 # ************* binary ops *************
 
 def unbroadcast(ctx, out, in_sh):
-  return ctx.op.reduce_op(ReduceOps.SUM, out, ctx.op.Buffer(in_sh))
+  return ctx.op.reduce_op(ReduceOps.SUM, out, ctx.buffer(in_sh))
 
 class Add(Function):
   def forward(ctx, x, y):
     ctx.save_for_backward(x.shape, y.shape)
-    buf = ctx.op.Buffer(binary_broadcast(x.shape, y.shape))
-    return ctx.op.binary_op(BinaryOps.ADD, x, y, buf) #ctx.op.Buffer(binary_broadcast(x.shape, y.shape)))
+    buf = ctx.buffer(binary_broadcast(x.shape, y.shape))
+    return ctx.op.binary_op(BinaryOps.ADD, x, y, buf)
 
   def backward(ctx, grad_output):
     shape_x, shape_y = ctx.saved_tensors
@@ -90,39 +90,39 @@ class Add(Function):
 class Sub(Function):
   def forward(ctx, x, y):
     ctx.save_for_backward(x.shape, y.shape)
-    return ctx.op.binary_op(BinaryOps.SUB, x, y, ctx.op.Buffer(binary_broadcast(x.shape, y.shape)))
+    return ctx.op.binary_op(BinaryOps.SUB, x, y, ctx.buffer(binary_broadcast(x.shape, y.shape)))
 
   def backward(ctx, grad_output):
     shape_x, shape_y = ctx.saved_tensors
-    neg_grad_output = ctx.op.unary_op(UnaryOps.NEG, grad_output, ctx.op.Buffer(grad_output.shape))
+    neg_grad_output = ctx.op.unary_op(UnaryOps.NEG, grad_output, ctx.buffer(grad_output.shape))
     return unbroadcast(ctx, grad_output, shape_x) if ctx.needs_input_grad[0] else None, \
            unbroadcast(ctx, neg_grad_output, shape_y) if ctx.needs_input_grad[1] else None
 
 class Mul(Function):
   def forward(ctx, x, y):
     ctx.save_for_backward(x, y)
-    return ctx.op.binary_op(BinaryOps.MUL, x, y, ctx.op.Buffer(binary_broadcast(x.shape, y.shape)))
+    return ctx.op.binary_op(BinaryOps.MUL, x, y, ctx.buffer(binary_broadcast(x.shape, y.shape)))
 
   def backward(ctx, grad_output):
     x,y = ctx.saved_tensors
-    tmp = ctx.op.Buffer(grad_output.shape)
+    tmp = ctx.buffer(grad_output.shape)
     grad_x = unbroadcast(ctx, ctx.op.binary_op(BinaryOps.MUL, y, grad_output, tmp), x.shape) if ctx.needs_input_grad[0] else None
     grad_y = unbroadcast(ctx, ctx.op.binary_op(BinaryOps.MUL, x, grad_output, tmp), y.shape) if ctx.needs_input_grad[1] else None
     return grad_x, grad_y
 
 class Pow(Function):
   def forward(ctx, x, y):
-    ret = ctx.op.Buffer(binary_broadcast(x.shape, y.shape))
+    ret = ctx.buffer(binary_broadcast(x.shape, y.shape))
     ctx.save_for_backward(x, y, ret)
     return ctx.op.binary_op(BinaryOps.POW, x, y, ret)
 
   def backward(ctx, grad_output):
     x,y,powxy = ctx.saved_tensors
-    tmp = ctx.op.Buffer(grad_output.shape)
+    tmp = ctx.buffer(grad_output.shape)
     ctx.op.binary_op(BinaryOps.DIV, x, powxy, tmp)      # pow(x,y)/x
     ctx.op.binary_op(BinaryOps.MUL, y, tmp, tmp)        # y * pow(x,y)/x
     grad_x = unbroadcast(ctx, ctx.op.binary_op(BinaryOps.MUL, grad_output, tmp, tmp), x.shape) if ctx.needs_input_grad[0] else None
-    log_x = ctx.op.unary_op(UnaryOps.LOG, x, ctx.op.Buffer(x.shape))
+    log_x = ctx.op.unary_op(UnaryOps.LOG, x, ctx.buffer(x.shape))
     ctx.op.binary_op(BinaryOps.MUL, log_x, powxy, tmp)    # log(x) * pow(x,y)
     grad_y = unbroadcast(ctx, ctx.op.binary_op(BinaryOps.MUL, grad_output, tmp, tmp), y.shape) if ctx.needs_input_grad[1] else None
     return grad_x, grad_y
@@ -142,24 +142,24 @@ class Reshape(Function):
 class Transpose(Function):
   def forward(ctx, x, order=(1,0)):
     ctx.save_for_backward(order)
-    ret = ctx.op.Buffer([x.shape[i] for i in order])
+    ret = ctx.buffer([x.shape[i] for i in order])
     return ctx.op.perm_axis(x, order, ret)
 
   def backward(ctx, grad_output):
     norder = np.argsort(ctx.order).tolist()
-    ret = ctx.op.Buffer([grad_output.shape[i] for i in norder])
+    ret = ctx.buffer([grad_output.shape[i] for i in norder])
     return ctx.op.perm_axis(grad_output, norder, ret)
 
 class Slice(Function):
   def forward(ctx, x, arg=None):
     ctx.save_for_backward(x.shape)
-    ret = ctx.op.Buffer([y[1]-y[0] for y in arg])
+    ret = ctx.buffer([y[1]-y[0] for y in arg])
     return ctx.op.inner_slice(x, arg, ret)
 
   def backward(ctx, grad_output):
     shape, = ctx.saved_tensors
     narg = [(0-p[0], grad_output.shape[i]+(shape[i]-p[1])) for i,p in enumerate(ctx.arg)]
-    ret = ctx.op.Buffer([y[1]-y[0] for y in narg])
+    ret = ctx.buffer([y[1]-y[0] for y in narg])
     return ctx.op.inner_slice(grad_output, narg, ret)
 
 # ************* processing ops *************
@@ -167,14 +167,14 @@ class Slice(Function):
 class Matmul(Function):
   def forward(ctx, input, weight):
     assert input.shape[-1] == weight.shape[-2]
-    ret = ctx.op.Buffer(list(input.shape[0:-1])+[weight.shape[-1]])
+    ret = ctx.buffer(list(input.shape[0:-1])+[weight.shape[-1]])
     ctx.save_for_backward(input, weight)
     return ctx.op.matmul(input, weight, ret)
 
   def backward(ctx, grad_output):
     input, weight = ctx.saved_tensors
-    grad_input = ctx.op.matmul(grad_output, weight, ctx.op.Buffer(input.shape), transpose_b=True) if ctx.needs_input_grad[0] else None
-    grad_weight = ctx.op.matmul(input, grad_output, ctx.op.Buffer(weight.shape), transpose_a=True) if ctx.needs_input_grad[1] else None
+    grad_input = ctx.op.matmul(grad_output, weight, ctx.buffer(input.shape), transpose_b=True) if ctx.needs_input_grad[0] else None
+    grad_weight = ctx.op.matmul(input, grad_output, ctx.buffer(weight.shape), transpose_a=True) if ctx.needs_input_grad[1] else None
     return grad_input, grad_weight
 
 class Conv2D(Function):
@@ -192,7 +192,7 @@ class Conv2D(Function):
 
     # output buffer
     conv_args = H, W, ctx.groups, rcout, cin, oy, ox, iy, ix, ys, xs, bs
-    return ctx.op.conv(x, w, ctx.op.Buffer((bs, cout, oy, ox)), conv_args)
+    return ctx.op.conv(x, w, ctx.buffer((bs, cout, oy, ox)), conv_args)
 
   def backward(ctx, grad_output):
     bs,_,oy,ox = grad_output.shape
@@ -206,6 +206,6 @@ class Conv2D(Function):
     rcout = cout//ctx.groups
 
     conv_args = H, W, ctx.groups, rcout, cin, oy, ox, iy, ix, ys, xs, bs
-    dx = ctx.op.convdx(w, grad_output, ctx.op.Buffer((bs, cin_, iy, ix)), conv_args) if ctx.needs_input_grad[0] else None
-    dw = ctx.op.convdw(x, grad_output, ctx.op.Buffer((cout, cin, H, W)), conv_args) if ctx.needs_input_grad[1] else None
+    dx = ctx.op.convdx(w, grad_output, ctx.buffer((bs, cin_, iy, ix)), conv_args) if ctx.needs_input_grad[0] else None
+    dw = ctx.op.convdw(x, grad_output, ctx.buffer((cout, cin, H, W)), conv_args) if ctx.needs_input_grad[1] else None
     return dx, dw
\ No newline at end of file
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index f0f528cd1f..04d20bd76b 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -75,7 +75,7 @@ class Device:
     DEFAULT = i if os.environ.get(name, 0) == "1" else DEFAULT
     try:
       llops[i] = importlib.import_module('tinygrad.llops.'+op)
-      buffers[i] = llops[i].Buffer
+      buffers[i] = [cls for name, cls in inspect.getmembers(llops[i], inspect.isclass) if name.endswith("Buffer")][0]
     except ImportError as e:
       print(op, "not available", e)
   DEFAULT = CPU if DEFAULT is None else DEFAULT