diff --git a/tinygrad/llops/opencl.py b/tinygrad/llops/opencl.py index 34101a3755..c635be915a 100644 --- a/tinygrad/llops/opencl.py +++ b/tinygrad/llops/opencl.py @@ -272,6 +272,10 @@ def convdx(w,grad_output,dx,conv_args): int g = get_global_id(1); int ci = get_global_id(2); + for (int Y = 0; Y < iy; Y++) { for (int X = 0; X < ix; X++) { + dx[B*groups*cin*iy*ix + g*cin*iy*ix + ci*iy*ix + Y*ix + X] = 0.0; + } } + for (int Y = 0; Y < oy; Y++) { for (int X = 0; X < ox; X++) { for (int y = 0; y < H; y++) { for (int x = 0; x < W; x++) { float acc = 0.0; diff --git a/tinygrad/ops/ops_gpu.py b/tinygrad/ops/ops_gpu.py index 8b5c7bf583..ca1b0b8931 100644 --- a/tinygrad/ops/ops_gpu.py +++ b/tinygrad/ops/ops_gpu.py @@ -1,24 +1,20 @@ -import pyopencl as cl import numpy as np from tinygrad.helpers import binary_broadcast from ..tensor import Function -from ..llops.opencl import GPUBuffer +from ..llops.opencl import GPUBuffer as Buffer from ..llops.opencl import unary_op, binary_op, reduce_op, perm_axis, inner_slice from ..llops.opencl import matmul, conv, convdw, convdx -def buffer_new(shape, zero=False): - return GPUBuffer(shape, hostbuf=None if not zero else np.zeros(shape, dtype=np.float32)) - # ************* unary ops ************* class UnaryOp(Function): def forward(ctx, input): ctx.save_for_backward(input) - return unary_op(ctx.fop, input, buffer_new(input.shape)) + return unary_op(ctx.fop, input, Buffer(input.shape)) def backward(ctx, grad_output): input, = ctx.saved_tensors - return binary_op(ctx.bop, input, grad_output, buffer_new(input.shape)) + return binary_op(ctx.bop, input, grad_output, Buffer(input.shape)) class ReLU(UnaryOp): fop = 'max(a, (float)0.)' @@ -42,36 +38,36 @@ def reduce_shape(shape, axis): class Sum(Function): def forward(ctx, input, axis=None): ctx.save_for_backward(input.shape) - return reduce_op("out += a", input, buffer_new(reduce_shape(input.shape, axis))) + return reduce_op("out += a", input, Buffer(reduce_shape(input.shape, axis))) def backward(ctx, grad_output): shape_input, = ctx.saved_tensors - # NOTE: the b buffer_new isn't used, since this is just for broadcast - ret = buffer_new(shape_input) + # NOTE: the b Buffer isn't used, since this is just for broadcast + ret = Buffer(shape_input) return binary_op('a', grad_output, ret, ret) class Max(Function): def forward(ctx, input, axis=None): - ret = reduce_op("out = max(a,out)", input, buffer_new(reduce_shape(input.shape, axis)), start="-INFINITY") + ret = reduce_op("out = max(a,out)", input, Buffer(reduce_shape(input.shape, axis)), start="-INFINITY") ctx.save_for_backward(input, axis, ret) return ret def backward(ctx, grad_output): input, axis, ret = ctx.saved_tensors - ret2 = binary_op("1.0*(a==b)", input, ret, buffer_new(input.shape)) - div = reduce_op("out += a", ret2, buffer_new(reduce_shape(ret2.shape, axis)), start="1e-10") + ret2 = binary_op("1.0*(a==b)", input, ret, Buffer(input.shape)) + div = reduce_op("out += a", ret2, Buffer(reduce_shape(ret2.shape, axis)), start="1e-10") binary_op("a/b", ret2, div, ret2) return binary_op('a*b', ret2, grad_output, ret2) # ************* binary ops ************* def unbroadcast(out, in_sh): - return reduce_op("out += a", out, buffer_new(in_sh)) + return reduce_op("out += a", out, Buffer(in_sh)) class Add(Function): def forward(ctx, x, y): ctx.save_for_backward(x.shape, y.shape) - return binary_op('a+b', x, y, buffer_new(binary_broadcast(x.shape, y.shape))) + return binary_op('a+b', x, y, Buffer(binary_broadcast(x.shape, y.shape))) def backward(ctx, grad_output): shape_x, shape_y = ctx.saved_tensors @@ -80,33 +76,33 @@ class Add(Function): class Sub(Function): def forward(ctx, x, y): ctx.save_for_backward(x.shape, y.shape) - return binary_op('a-b', x, y, buffer_new(binary_broadcast(x.shape, y.shape))) + return binary_op('a-b', x, y, Buffer(binary_broadcast(x.shape, y.shape))) def backward(ctx, grad_output): shape_x, shape_y = ctx.saved_tensors - grad_x, grad_y = grad_output, unary_op('-a', grad_output, buffer_new(grad_output.shape)) + grad_x, grad_y = grad_output, unary_op('-a', grad_output, Buffer(grad_output.shape)) return unbroadcast(grad_x, shape_x), unbroadcast(grad_y, shape_y) class Mul(Function): def forward(ctx, x, y): ctx.save_for_backward(x, y) - return binary_op('a*b', x, y, buffer_new(binary_broadcast(x.shape, y.shape))) + return binary_op('a*b', x, y, Buffer(binary_broadcast(x.shape, y.shape))) def backward(ctx, grad_output): x,y = ctx.saved_tensors - grad_x = binary_op('a*b', y, grad_output, buffer_new(grad_output.shape)) - grad_y = binary_op('a*b', x, grad_output, buffer_new(grad_output.shape)) + grad_x = binary_op('a*b', y, grad_output, Buffer(grad_output.shape)) + grad_y = binary_op('a*b', x, grad_output, Buffer(grad_output.shape)) return unbroadcast(grad_x, x.shape), unbroadcast(grad_y, y.shape) class Pow(Function): def forward(ctx, x, y): ctx.save_for_backward(x, y) - return binary_op('pow(a,b)', x, y, buffer_new(binary_broadcast(x.shape, y.shape))) + return binary_op('pow(a,b)', x, y, Buffer(binary_broadcast(x.shape, y.shape))) def backward(ctx, grad_output): x,y = ctx.saved_tensors - grad_x_inter = binary_op('b * (pow((float)a, (float)(b-1.0)))', x, y, buffer_new(grad_output.shape)) - grad_y_inter = binary_op('pow(a, (float)b) * log(a);', x, y, buffer_new(grad_output.shape)) + grad_x_inter = binary_op('b * (pow((float)a, (float)(b-1.0)))', x, y, Buffer(grad_output.shape)) + grad_y_inter = binary_op('pow(a, (float)b) * log(a);', x, y, Buffer(grad_output.shape)) return unbroadcast(binary_op('a*b', grad_output, grad_x_inter, grad_x_inter), x.shape), \ unbroadcast(binary_op('a*b', grad_output, grad_y_inter, grad_y_inter), y.shape) @@ -116,35 +112,35 @@ class Reshape(Function): def forward(ctx, x, shape): ctx.save_for_backward(x.shape) shape = tuple(-np.prod(x.shape) // np.prod(shape) if s == -1 else s for s in shape) - r = GPUBuffer(shape, hostbuf=x) # NOTE: this is not a copy + r = Buffer(shape, hostbuf=x) # NOTE: this is not a copy assert np.prod(x.shape) == np.prod(r.shape) return r def backward(ctx, grad_output): in_shape, = ctx.saved_tensors - return GPUBuffer(in_shape, hostbuf=grad_output) + return Buffer(in_shape, hostbuf=grad_output) class Transpose(Function): def forward(ctx, x, order=(1,0)): ctx.save_for_backward(order) - ret = buffer_new(np.array(x.shape)[list(order)]) + ret = Buffer(np.array(x.shape)[list(order)]) return perm_axis(x, order, ret) def backward(ctx, grad_output): norder = np.argsort(ctx.order) - ret = buffer_new(np.array(grad_output.shape)[list(norder)]) + ret = Buffer(np.array(grad_output.shape)[list(norder)]) return perm_axis(grad_output, norder, ret) class Slice(Function): def forward(ctx, x, arg=None): ctx.save_for_backward(x.shape) - ret = buffer_new([y[1]-y[0] for y in arg]) + ret = Buffer([y[1]-y[0] for y in arg]) return inner_slice(x, arg, ret) def backward(ctx, grad_output): shape, = ctx.saved_tensors narg = [(0-p[0], grad_output.shape[i]+(shape[i]-p[1])) for i,p in enumerate(ctx.arg)] - ret = buffer_new([y[1]-y[0] for y in narg]) + ret = Buffer([y[1]-y[0] for y in narg]) return inner_slice(grad_output, narg, ret) # ************* processing ops ************* @@ -152,14 +148,14 @@ class Slice(Function): class Matmul(Function): def forward(ctx, input, weight): assert input.shape[-1] == weight.shape[-2] - ret = buffer_new(list(input.shape[0:-1])+[weight.shape[-1]]) + ret = Buffer(list(input.shape[0:-1])+[weight.shape[-1]]) ctx.save_for_backward(input, weight) return matmul(input, weight, ret) def backward(ctx, grad_output): input, weight = ctx.saved_tensors - grad_input = matmul(grad_output, weight, buffer_new(input.shape), transpose_b=True) if ctx.needs_input_grad[0] else None - grad_weight = matmul(input, grad_output, buffer_new(weight.shape), transpose_a=True) if ctx.needs_input_grad[1] else None + grad_input = matmul(grad_output, weight, Buffer(input.shape), transpose_b=True) if ctx.needs_input_grad[0] else None + grad_weight = matmul(input, grad_output, Buffer(weight.shape), transpose_a=True) if ctx.needs_input_grad[1] else None return grad_input, grad_weight class Conv2D(Function): @@ -177,7 +173,7 @@ class Conv2D(Function): # output buffer conv_args = H, W, groups, rcout, cin, oy, ox, iy, ix, ys, xs, bs - return conv(x, w, buffer_new((bs, cout, oy, ox)), conv_args) + return conv(x, w, Buffer((bs, cout, oy, ox)), conv_args) def backward(ctx, grad_output): bs,_,oy,ox = grad_output.shape @@ -191,6 +187,6 @@ class Conv2D(Function): rcout = cout//ctx.groups conv_args = H, W, ctx.groups, rcout, cin, oy, ox, iy, ix, ys, xs, bs - dx = convdx(w, grad_output, buffer_new((bs, cin_, iy, ix), zero=True), conv_args) if ctx.needs_input_grad[0] else None - dw = convdw(x, grad_output, buffer_new((cout, cin, H, W)), conv_args) if ctx.needs_input_grad[1] else None + dx = convdx(w, grad_output, Buffer((bs, cin_, iy, ix)), conv_args) if ctx.needs_input_grad[0] else None + dw = convdw(x, grad_output, Buffer((cout, cin, H, W)), conv_args) if ctx.needs_input_grad[1] else None return dx, dw