diff --git a/test/test_llops.py b/test/test_llops.py index b7fbb6f6ad..2606ea8bc9 100644 --- a/test/test_llops.py +++ b/test/test_llops.py @@ -6,7 +6,7 @@ import numpy as np from tinygrad.tensor import Device from tinygrad.helpers import UnaryOps, BinaryOps, ReduceOps if Device.DEFAULT == Device.GPU: - from tinygrad.llops.opencl import GPUBuffer, sync, unary_op, binary_op, reduce_op + from tinygrad.llops.ops_gpu import GPUBuffer, sync, unary_op, binary_op, reduce_op def timeit(fxn, its=1000, done=lambda:None): fxn() diff --git a/tinygrad/llops/ops_cpu.py b/tinygrad/llops/ops_cpu.py index 28a713b16f..aee0a8a587 100644 --- a/tinygrad/llops/ops_cpu.py +++ b/tinygrad/llops/ops_cpu.py @@ -1,7 +1,7 @@ import numpy as np from tinygrad.helpers import UnaryOps, BinaryOps, ReduceOps -class Buffer(np.ndarray): +class CPUBuffer(np.ndarray): def toCPU(x): return x def log(x): return np.log(x) def exp(x): return np.exp(x) diff --git a/tinygrad/llops/ops_gpu.py b/tinygrad/llops/ops_gpu.py index d4c5d3fff0..3be6e38ef5 100644 --- a/tinygrad/llops/ops_gpu.py +++ b/tinygrad/llops/ops_gpu.py @@ -17,20 +17,20 @@ i32 = np.int32 def roundup(x, n=4): return (x+(n-1))//n * n def sync(): cl_queue.finish() -class Buffer: +class GPUBuffer: def __init__(self, shape, hostbuf=None): require_init_gpu() self.shape, self.dtype = tuple(shape), np.float32 - self.cl = hostbuf.cl if isinstance(hostbuf, Buffer) else cl.Buffer(cl_ctx, cl.mem_flags.READ_WRITE, 4*roundup(np.prod(shape))) # padding - if hostbuf is not None and not isinstance(hostbuf, Buffer): + self.cl = hostbuf.cl if isinstance(hostbuf, GPUBuffer) else cl.Buffer(cl_ctx, cl.mem_flags.READ_WRITE, 4*roundup(np.prod(shape))) # padding + if hostbuf is not None and not isinstance(hostbuf, GPUBuffer): cl.enqueue_copy(cl_queue, self.cl, hostbuf.astype(np.float32).ravel()) def __repr__(self): - return f"" + return f"" @staticmethod def fromCPU(x): - return Buffer(x.shape, x.view(np.ndarray)) + return GPUBuffer(x.shape, x.view(np.ndarray)) def toCPU(self): data = np.empty(self.shape, dtype=np.float32) @@ -145,7 +145,7 @@ def reduce_op(op, inp, ret): def reshape(x, shape): assert np.prod(x.shape) == np.prod(shape) - return Buffer(shape, hostbuf=x) + return GPUBuffer(shape, hostbuf=x) def perm_axis(inp, order, ret): perm = clbuild("perm", """ diff --git a/tinygrad/llops/ops_torch.py b/tinygrad/llops/ops_torch.py index fc8810b595..d6e12de07d 100644 --- a/tinygrad/llops/ops_torch.py +++ b/tinygrad/llops/ops_torch.py @@ -2,16 +2,16 @@ import torch import numpy as np device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") -class Buffer(torch.Tensor): +class TorchBuffer(torch.Tensor): def __new__(cls, shape): if isinstance(shape, torch.Tensor): return super().__new__(cls, shape) else: - return Buffer(torch.zeros(shape)) + return TorchBuffer(torch.zeros(shape)) custompad = lambda x,padding: torch.nn.functional.pad(x, [item for sublist in padding[::-1] for item in sublist]) @staticmethod def fromCPU(data): - return Buffer(torch.from_numpy(data).requires_grad_(False)).to(device) + return TorchBuffer(torch.from_numpy(data).requires_grad_(False)).to(device) def toCPU(x): return x.cpu().numpy() def getdtype(self): diff --git a/tinygrad/mlops.py b/tinygrad/mlops.py index 05c674d873..f735b5660f 100644 --- a/tinygrad/mlops.py +++ b/tinygrad/mlops.py @@ -14,18 +14,18 @@ def select_llops(ops): class _UnaryOp(Function): def forward(ctx, input): ctx.save_for_backward(input) - return ctx.op.unary_op(ctx.fop, input, ctx.op.Buffer(input.shape)) + return ctx.op.unary_op(ctx.fop, input, ctx.buffer(input.shape)) def backward(ctx, grad_output): input, = ctx.saved_tensors - return ctx.op.binary_op(ctx.bop, input, grad_output, ctx.op.Buffer(input.shape)) + return ctx.op.binary_op(ctx.bop, input, grad_output, ctx.buffer(input.shape)) class ReLU(_UnaryOp): fop = UnaryOps.RELU def backward(ctx, grad_output): input, = ctx.saved_tensors - ret = ctx.op.Buffer(input.shape) + ret = ctx.buffer(input.shape) ctx.op.unary_op(UnaryOps.SIGN, input, ret) ctx.op.unary_op(UnaryOps.RELU, ret, ret) return ctx.op.binary_op(BinaryOps.MUL, ret, grad_output, ret) @@ -36,7 +36,7 @@ class Log(_UnaryOp): class Exp(_UnaryOp): def forward(ctx, input): - ret = ctx.op.unary_op(UnaryOps.EXP, input, ctx.op.Buffer(input.shape)) + ret = ctx.op.unary_op(UnaryOps.EXP, input, ctx.buffer(input.shape)) ctx.save_for_backward(ret) # we save the output here, not the input return ret @@ -50,37 +50,37 @@ def reduce_shape(shape, axis): class Sum(Function): def forward(ctx, input, axis=None): ctx.save_for_backward(input.shape) - return ctx.op.reduce_op(ReduceOps.SUM, input, ctx.op.Buffer(reduce_shape(input.shape, axis))) + return ctx.op.reduce_op(ReduceOps.SUM, input, ctx.buffer(reduce_shape(input.shape, axis))) def backward(ctx, grad_output): shape_input, = ctx.saved_tensors # NOTE: the b Buffer isn't used, since this is just for broadcast - ret = ctx.op.Buffer(shape_input) + ret = ctx.buffer(shape_input) return ctx.op.binary_op(BinaryOps.A, grad_output, ret, ret) class Max(Function): def forward(ctx, input, axis=None): - ret = ctx.op.reduce_op(ReduceOps.MAX, input, ctx.op.Buffer(reduce_shape(input.shape, axis))) + ret = ctx.op.reduce_op(ReduceOps.MAX, input, ctx.buffer(reduce_shape(input.shape, axis))) ctx.save_for_backward(input, ret) return ret def backward(ctx, grad_output): input, ret = ctx.saved_tensors - ret2 = ctx.op.binary_op(BinaryOps.CMPEQ, input, ret, ctx.op.Buffer(input.shape)) - div = ctx.op.reduce_op(ReduceOps.SUM, ret2, ctx.op.Buffer(grad_output.shape)) + ret2 = ctx.op.binary_op(BinaryOps.CMPEQ, input, ret, ctx.buffer(input.shape)) + div = ctx.op.reduce_op(ReduceOps.SUM, ret2, ctx.buffer(grad_output.shape)) ctx.op.binary_op(BinaryOps.DIV, div, ret2, ret2) return ctx.op.binary_op(BinaryOps.MUL, ret2, grad_output, ret2) # ************* binary ops ************* def unbroadcast(ctx, out, in_sh): - return ctx.op.reduce_op(ReduceOps.SUM, out, ctx.op.Buffer(in_sh)) + return ctx.op.reduce_op(ReduceOps.SUM, out, ctx.buffer(in_sh)) class Add(Function): def forward(ctx, x, y): ctx.save_for_backward(x.shape, y.shape) - buf = ctx.op.Buffer(binary_broadcast(x.shape, y.shape)) - return ctx.op.binary_op(BinaryOps.ADD, x, y, buf) #ctx.op.Buffer(binary_broadcast(x.shape, y.shape))) + buf = ctx.buffer(binary_broadcast(x.shape, y.shape)) + return ctx.op.binary_op(BinaryOps.ADD, x, y, buf) #ctx.buffer(binary_broadcast(x.shape, y.shape))) def backward(ctx, grad_output): shape_x, shape_y = ctx.saved_tensors @@ -90,39 +90,39 @@ class Add(Function): class Sub(Function): def forward(ctx, x, y): ctx.save_for_backward(x.shape, y.shape) - return ctx.op.binary_op(BinaryOps.SUB, x, y, ctx.op.Buffer(binary_broadcast(x.shape, y.shape))) + return ctx.op.binary_op(BinaryOps.SUB, x, y, ctx.buffer(binary_broadcast(x.shape, y.shape))) def backward(ctx, grad_output): shape_x, shape_y = ctx.saved_tensors - neg_grad_output = ctx.op.unary_op(UnaryOps.NEG, grad_output, ctx.op.Buffer(grad_output.shape)) + neg_grad_output = ctx.op.unary_op(UnaryOps.NEG, grad_output, ctx.buffer(grad_output.shape)) return unbroadcast(ctx, grad_output, shape_x) if ctx.needs_input_grad[0] else None, \ unbroadcast(ctx, neg_grad_output, shape_y) if ctx.needs_input_grad[1] else None class Mul(Function): def forward(ctx, x, y): ctx.save_for_backward(x, y) - return ctx.op.binary_op(BinaryOps.MUL, x, y, ctx.op.Buffer(binary_broadcast(x.shape, y.shape))) + return ctx.op.binary_op(BinaryOps.MUL, x, y, ctx.buffer(binary_broadcast(x.shape, y.shape))) def backward(ctx, grad_output): x,y = ctx.saved_tensors - tmp = ctx.op.Buffer(grad_output.shape) + tmp = ctx.buffer(grad_output.shape) grad_x = unbroadcast(ctx, ctx.op.binary_op(BinaryOps.MUL, y, grad_output, tmp), x.shape) if ctx.needs_input_grad[0] else None grad_y = unbroadcast(ctx, ctx.op.binary_op(BinaryOps.MUL, x, grad_output, tmp), y.shape) if ctx.needs_input_grad[1] else None return grad_x, grad_y class Pow(Function): def forward(ctx, x, y): - ret = ctx.op.Buffer(binary_broadcast(x.shape, y.shape)) + ret = ctx.buffer(binary_broadcast(x.shape, y.shape)) ctx.save_for_backward(x, y, ret) return ctx.op.binary_op(BinaryOps.POW, x, y, ret) def backward(ctx, grad_output): x,y,powxy = ctx.saved_tensors - tmp = ctx.op.Buffer(grad_output.shape) + tmp = ctx.buffer(grad_output.shape) ctx.op.binary_op(BinaryOps.DIV, x, powxy, tmp) # pow(x,y)/x ctx.op.binary_op(BinaryOps.MUL, y, tmp, tmp) # y * pow(x,y)/x grad_x = unbroadcast(ctx, ctx.op.binary_op(BinaryOps.MUL, grad_output, tmp, tmp), x.shape) if ctx.needs_input_grad[0] else None - log_x = ctx.op.unary_op(UnaryOps.LOG, x, ctx.op.Buffer(x.shape)) + log_x = ctx.op.unary_op(UnaryOps.LOG, x, ctx.buffer(x.shape)) ctx.op.binary_op(BinaryOps.MUL, log_x, powxy, tmp) # log(x) * pow(x,y) grad_y = unbroadcast(ctx, ctx.op.binary_op(BinaryOps.MUL, grad_output, tmp, tmp), y.shape) if ctx.needs_input_grad[1] else None return grad_x, grad_y @@ -142,24 +142,24 @@ class Reshape(Function): class Transpose(Function): def forward(ctx, x, order=(1,0)): ctx.save_for_backward(order) - ret = ctx.op.Buffer([x.shape[i] for i in order]) + ret = ctx.buffer([x.shape[i] for i in order]) return ctx.op.perm_axis(x, order, ret) def backward(ctx, grad_output): norder = np.argsort(ctx.order).tolist() - ret = ctx.op.Buffer([grad_output.shape[i] for i in norder]) + ret = ctx.buffer([grad_output.shape[i] for i in norder]) return ctx.op.perm_axis(grad_output, norder, ret) class Slice(Function): def forward(ctx, x, arg=None): ctx.save_for_backward(x.shape) - ret = ctx.op.Buffer([y[1]-y[0] for y in arg]) + ret = ctx.buffer([y[1]-y[0] for y in arg]) return ctx.op.inner_slice(x, arg, ret) def backward(ctx, grad_output): shape, = ctx.saved_tensors narg = [(0-p[0], grad_output.shape[i]+(shape[i]-p[1])) for i,p in enumerate(ctx.arg)] - ret = ctx.op.Buffer([y[1]-y[0] for y in narg]) + ret = ctx.buffer([y[1]-y[0] for y in narg]) return ctx.op.inner_slice(grad_output, narg, ret) # ************* processing ops ************* @@ -167,14 +167,14 @@ class Slice(Function): class Matmul(Function): def forward(ctx, input, weight): assert input.shape[-1] == weight.shape[-2] - ret = ctx.op.Buffer(list(input.shape[0:-1])+[weight.shape[-1]]) + ret = ctx.buffer(list(input.shape[0:-1])+[weight.shape[-1]]) ctx.save_for_backward(input, weight) return ctx.op.matmul(input, weight, ret) def backward(ctx, grad_output): input, weight = ctx.saved_tensors - grad_input = ctx.op.matmul(grad_output, weight, ctx.op.Buffer(input.shape), transpose_b=True) if ctx.needs_input_grad[0] else None - grad_weight = ctx.op.matmul(input, grad_output, ctx.op.Buffer(weight.shape), transpose_a=True) if ctx.needs_input_grad[1] else None + grad_input = ctx.op.matmul(grad_output, weight, ctx.buffer(input.shape), transpose_b=True) if ctx.needs_input_grad[0] else None + grad_weight = ctx.op.matmul(input, grad_output, ctx.buffer(weight.shape), transpose_a=True) if ctx.needs_input_grad[1] else None return grad_input, grad_weight class Conv2D(Function): @@ -192,7 +192,7 @@ class Conv2D(Function): # output buffer conv_args = H, W, ctx.groups, rcout, cin, oy, ox, iy, ix, ys, xs, bs - return ctx.op.conv(x, w, ctx.op.Buffer((bs, cout, oy, ox)), conv_args) + return ctx.op.conv(x, w, ctx.buffer((bs, cout, oy, ox)), conv_args) def backward(ctx, grad_output): bs,_,oy,ox = grad_output.shape @@ -206,6 +206,6 @@ class Conv2D(Function): rcout = cout//ctx.groups conv_args = H, W, ctx.groups, rcout, cin, oy, ox, iy, ix, ys, xs, bs - dx = ctx.op.convdx(w, grad_output, ctx.op.Buffer((bs, cin_, iy, ix)), conv_args) if ctx.needs_input_grad[0] else None - dw = ctx.op.convdw(x, grad_output, ctx.op.Buffer((cout, cin, H, W)), conv_args) if ctx.needs_input_grad[1] else None + dx = ctx.op.convdx(w, grad_output, ctx.buffer((bs, cin_, iy, ix)), conv_args) if ctx.needs_input_grad[0] else None + dw = ctx.op.convdw(x, grad_output, ctx.buffer((cout, cin, H, W)), conv_args) if ctx.needs_input_grad[1] else None return dx, dw \ No newline at end of file diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py index f0f528cd1f..04d20bd76b 100644 --- a/tinygrad/tensor.py +++ b/tinygrad/tensor.py @@ -75,7 +75,7 @@ class Device: DEFAULT = i if os.environ.get(name, 0) == "1" else DEFAULT try: llops[i] = importlib.import_module('tinygrad.llops.'+op) - buffers[i] = llops[i].Buffer + buffers[i] = [cls for name, cls in inspect.getmembers(llops[i], inspect.isclass) if name.endswith("Buffer")][0] except ImportError as e: print(op, "not available", e) DEFAULT = CPU if DEFAULT is None else DEFAULT