mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-11 23:46:02 +08:00
* models matrix * fix typo and install gpu deps * install llvm deps if needed * fix * testops with cuda * remove pip cache since not work * cuda env * install cuda deps * maybe it will work now * i can't read * all tests in matrix * trim down more * opencl stuff in matrix * opencl pip cache * test split * change cuda test exclusion * test * fix cuda maybe * add models * add more n=auto * third thing * fix bug * cache pip more * change name * update tests * try again cause why not * balance * try again... * try apt cache for cuda * try on gpu: * try cuda again * update packages step * replace libz-dev with zlib1g-dev * only cache cuda * why error * fix gpuocelot bug * apt cache err * apt cache to slow? * opt and image in single runner * add a couple n=autos * remove test matrix * try cuda apt cache again * libz-dev -> zlib1g-dev * remove -s since not supported by xdist * the cache takes too long and doesn't work * combine webgpu and metal tests * combine imagenet to c and cpu tests * torch tests with linters * torch back by itself * small windows clang test with torch tests * fix a goofy windows bug * im dumb * bro * clang with linters * fix pylint error * linter not work on windows * try with clang again * clang and imagenet? * install deps * fix * fix quote * clang by itself (windows too slow) * env vars for imagenet * cache pip for metal and webgpu tests * try torch with metal and webgpu * doesn't work, too long * remove -v * try -n=logical * don't use logical * revert accidental thing * remove some prints unless CI * fix print unless CI * ignore speed tests for slow tests * clang windows in matrix (ubuntu being tested in imagenet->c test) * try manual pip cache * fix windows pip cache path * all manual pip cache * fix pip cache dir for macos * print_ci function in helpers * CI as variable, no print_ci * missed one * cuda tests with docker image * remove setup-python action for cuda * python->python3? 
* remove -s -v * try fix pip cache * maybe fix * try to fix pip cache * is this the path? * maybe cache pip * try again * create wheels dir * ? * cuda pip deps in dockerfile * disable pip cache for clang * image from ghcr instead of docker hub * why is clang like this * fast deps * try use different caches * remove the fast thing * try with lighter image * remove setup python for cuda * small docker and cuda fast deps * ignore a few more tests * cool docker thing (maybe) * oops * quotes * fix docker command * fix bug * ignore train efficientnet test * remove dockerfile (docker stuff takes too long) * remove docker stuff and normal cuda * oops * ignore the tests for cuda * does this work * ignore test_train on slow backends * add space * llvm ignore same tests as cuda * nvm * ignore lr scheduler tests * get some stats * fix ignore bug * remove extra ' * remove and * ignore test for llvm * change ignored tests and durationon all backends * fix * and -> or * ignore some more cuda tests * finally? * does this fix it * remove durations=0 * add some more tests to llvm * make last pytest more readable * fix * don't train efficientnet on cpu * try w/out pip cache * pip cache seems to be generally better * pytest file markers * try apt fast for cuda * use quick install for apt-fast * apt-fast not worth * apt-get to apt * fix typo * suppress warnings * register markers * disable debug on fuzz tests * change marker names * apt update and apt install in one command * update marker names in test.yml * webgpu pytest marker
108 lines
4.9 KiB
Python
108 lines
4.9 KiB
Python
# this is an example of how you can write terrible DSP compute breaking ops like warpPerspective
|
|
# here we use a CUSTOM op to write atan2
|
|
|
|
import unittest
|
|
import numpy as np
|
|
from typing import Optional, Tuple
|
|
from tinygrad.helpers import prod, dtypes
|
|
|
|
# *** first, we implement the atan2 op at the lowest level ***
|
|
# `atan2_gpu` for GPUBuffers and `atan2_cpu` for CPUBuffers
|
|
from tinygrad.lazy import LazyBuffer, create_lazybuffer, Device
|
|
from tinygrad.ops import ASTRunner
|
|
from tinygrad.shape.shapetracker import ShapeTracker
|
|
import pytest
|
|
|
|
# module-level pytest marker: applies the `webgpu` mark to every test in this file
# NOTE(review): presumably used by CI to select/deselect tests for the WebGPU job — confirm against the workflow config
pytestmark = pytest.mark.webgpu
|
|
|
|
# we don't always have GPU support, so the type signature uses the generic LazyBuffer instead of a concrete GPUBuffer
|
|
def atan2_gpu(ret:LazyBuffer, a:LazyBuffer, b:LazyBuffer):
  """Compute elementwise atan2(a, b) into `ret` using a hand-written GPU kernel.

  Both inputs must live on the "GPU" device and be float32; anything else
  trips an assertion. A fresh output buffer of prod(ret.shape) elements is
  allocated and stored in `ret.realized`, the kernel is built and launched
  with one work-item per output element, and `ret.realized` is returned.
  """
  assert a.device == "GPU" and b.device == "GPU", "gpu function requires GPUBuffers"
  assert a.dtype == b.dtype and a.dtype == dtypes.float32, "gpu function only supports float32"

  numel = prod(ret.shape)
  device = Device[ret.device]

  # allocate the output before launching, since the kernel writes into it
  ret.realized = device.buffer(numel, ret.dtype)

  # each work-item handles the element at its global id; all three buffers are indexed linearly
  kernel_src = """
    __kernel void atan2_gpu(global float *c, global float *a, global float *b) {
      int idx = get_global_id(0);
      c[idx] = atan2(a[idx], b[idx]);
    }"""
  program = ASTRunner("atan2_gpu", kernel_src, global_size=[numel]).build(device.runtime)
  program.exec([ret, a, b])
  return ret.realized
|
|
|
|
def atan2_cpu(ret:LazyBuffer, a:LazyBuffer, b:LazyBuffer):
  """Compute elementwise atan2(a, b) with numpy and wrap it as a buffer for ret's device.

  Reads the underlying arrays from `a.realized._buf` and `b.realized._buf`,
  applies `np.arctan2`, and returns the result wrapped via the device's
  `from_underlying`.
  """
  angles = np.arctan2(a.realized._buf, b.realized._buf)
  return Device[ret.device].from_underlying(angles)
|
|
|
|
# *** second, we write the ATan2 mlop ***
|
|
# NOTE: The derivative of atan2 doesn't need a custom op! https://www.liquisearch.com/atan2/derivative
|
|
# In general, writing a backward function is also optional; your backward pass just won't work without it
|
|
|
|
from tinygrad.ops import ASTRunner, LazyOp, LoadOps, BinaryOps, UnaryOps
|
|
from tinygrad.lazy import LazyBuffer
|
|
from tinygrad.tensor import Function
|
|
|
|
class ATan2(Function):
  """mlop for elementwise atan2(a, b).

  The forward pass is a CUSTOM load op dispatched to `atan2_gpu` or
  `atan2_cpu` by device; the backward pass is expressed with ordinary
  binary ops.
  """

  def forward(self, a:LazyBuffer, b:LazyBuffer) -> LazyBuffer:
    """Build the lazy CUSTOM op computing atan2(a, b).

    Asserts that both inputs have the same element count and device, and
    saves them on `self` for use in `backward`. Raises KeyError if the
    device is neither "GPU" nor "CPU".
    """
    assert prod(a.shape) == prod(b.shape) and a.device == b.device, "shape or device mismatch"
    self.a, self.b = a, b

    # the custom implementations receive contiguous versions of both inputs
    sources = (a.contiguous(), b.contiguous())
    custom_impl = {"GPU": atan2_gpu, "CPU": atan2_cpu}[a.device]
    ast = LazyOp(LoadOps.CUSTOM, sources, custom_impl)

    # output takes a's shape and the larger of the two input dtypes
    out_dtype = max(a.dtype, b.dtype)
    return create_lazybuffer(a.device, ShapeTracker(a.shape), LoadOps, ast, out_dtype)

  def backward(self, grad_output:LazyBuffer) -> Tuple[Optional[LazyBuffer], Optional[LazyBuffer]]:
    """Return the gradients with respect to (a, b); None for inputs that don't need one.

    With denom = a*a + b*b, the gradient for a is grad_output * (b / denom)
    and the gradient for b is grad_output * ((0 - a) / denom).
    """
    a_squared = self.a.binary_op(BinaryOps.MUL, self.a)
    b_squared = self.b.binary_op(BinaryOps.MUL, self.b)
    denom = a_squared.binary_op(BinaryOps.ADD, b_squared)

    grad_a, grad_b = None, None
    if self.needs_input_grad[0]:
      grad_a = grad_output.binary_op(BinaryOps.MUL, self.b.binary_op(BinaryOps.DIV, denom))
    if self.needs_input_grad[1]:
      # negate a as (0 - a)
      neg_a = self.a.const_like(0).binary_op(BinaryOps.SUB, self.a)
      grad_b = grad_output.binary_op(BinaryOps.MUL, neg_a.binary_op(BinaryOps.DIV, denom))
    return grad_a, grad_b
|
|
|
|
# *** third, we use our lovely new mlop in some tests ***
|
|
|
|
from tinygrad.tensor import Tensor, Device
|
|
|
|
@unittest.skipUnless(Device.DEFAULT in ["CPU", "GPU"], "atan2 is only implemented for CPU and GPU")
class TestCustomFunction(unittest.TestCase):
  """Checks the ATan2 mlop: forward values, gradients (against torch), and use under TinyJit."""

  @staticmethod
  def _random_operands():
    # two random 4x4 Tensors, permuted just because we can
    lhs = Tensor.randn(4,4,requires_grad=True).permute(1,0)
    rhs = Tensor.randn(4,4,requires_grad=True).permute(1,0)
    return lhs, rhs

  def test_atan2_forward(self):
    a, b = self._random_operands()

    # everything stays lazy until .numpy() is called
    c = ATan2.apply(a, b)
    print(c.numpy())

    # compare against numpy's arctan2 as the reference
    expected = np.arctan2(a.numpy(), b.numpy())
    np.testing.assert_allclose(c.numpy(), expected, atol=1e-5)

  # the gradient formula in ATan2.backward is built from plain binary ops, not a custom kernel
  def test_atan2_backward(self):
    # a forward graph is needed before backward can run
    a, b = self._random_operands()
    c = ATan2.apply(a, b)

    # backprop through the mean of the output
    c.mean().backward()
    assert a.grad is not None and b.grad is not None, "tinygrad didn't compute gradients"
    print(a.grad.numpy())
    print(b.grad.numpy())

    # repeat the same computation in torch and use its gradients as the reference
    import torch
    ta = torch.tensor(a.numpy(), requires_grad=True)
    tb = torch.tensor(b.numpy(), requires_grad=True)
    torch.atan2(ta, tb).mean().backward()
    assert ta.grad is not None and tb.grad is not None, "torch didn't compute gradients"
    np.testing.assert_allclose(a.grad.numpy(), ta.grad.numpy(), atol=1e-5)
    np.testing.assert_allclose(b.grad.numpy(), tb.grad.numpy(), atol=1e-5)

  def test_atan2_jit(self):
    # the custom op is also usable from inside a TinyJit-wrapped function
    from tinygrad.jit import TinyJit

    @TinyJit
    def jitted_atan2(a:Tensor, b:Tensor) -> Tensor:
      return ATan2.apply(a, b).realize()

    # call several times with fresh inputs and check every result
    for _ in range(5):
      a, b = self._random_operands()
      c = jitted_atan2(a, b)
      expected = np.arctan2(a.numpy(), b.numpy())
      np.testing.assert_allclose(c.numpy(), expected, atol=1e-5)
|
|
|
|
# run the unittest test runner when this file is executed directly as a script
if __name__ == "__main__":
  unittest.main()
|