diff --git a/setup.py b/setup.py index 83b046172d..5fbb44e959 100644 --- a/setup.py +++ b/setup.py @@ -24,6 +24,7 @@ setup(name='tinygrad', extras_require={ 'gpu': ["pyopencl", "six"], 'llvm': ["llvmlite"], + 'cuda': ["pycuda"], 'testing': [ "pytest", "torch~=1.11.0", diff --git a/test/test_ops.py b/test/test_ops.py index f332304672..455ead12bc 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -54,6 +54,8 @@ class TestOps(unittest.TestCase): def test_add(self): helper_test_op([(45,65), (45,65)], lambda x,y: x+y, Tensor.add) + def test_add_simple(self): + helper_test_op([(256), (256)], lambda x,y: x+y, Tensor.add, forward_only=True) def test_broadcasted_add(self): helper_test_op([(45,65), (45,1)], lambda x,y: x+y, lambda x,y: x+y) def test_broadcasted_add_2(self): diff --git a/tinygrad/llops/ops_gpu.py b/tinygrad/llops/ops_gpu.py index c3eef31459..2cca5665b3 100644 --- a/tinygrad/llops/ops_gpu.py +++ b/tinygrad/llops/ops_gpu.py @@ -1,8 +1,6 @@ from __future__ import annotations -import os, functools, time, platform +import os import numpy as np -import pyopencl as cl # type: ignore -from collections import defaultdict from typing import List, Tuple, Optional, Dict, Union, Set from tinygrad.helpers import prod from tinygrad.ops import DEBUG, UnaryOps, BinaryOps, ReduceOps, MovementOps, LazyOp, Op, ExplicitExecAST, GlobalCounters @@ -11,97 +9,16 @@ from tinygrad.lazy import IMAGE from tinygrad.shape import ShapeTracker, View, ZeroView from tinygrad.shape.symbolic import Variable, ModNode -OSX = platform.system() == "Darwin" +CUDA = int(os.getenv("CUDA", "0")) +if not CUDA: from tinygrad.runtime.opencl import CLBuffer, CLImage, CLProgram, CL # NOTE: using CL will not work for the CUDA runtime # noqa: F401 +else: from tinygrad.runtime.cuda import CLBuffer, CLImage, CLProgram # type: ignore + VALIDHACKS = int(os.getenv("VALIDHACKS", "0")) # TODO: remove the need for this NATIVE_EXPLOG = int(os.getenv("NATIVE_EXPLOG", "0")) # this is needed as a switch for the tests to pass -CLCACHE = int(os.getenv("CLCACHE", "1")) -FLOAT16 = int(os.getenv("FLOAT16", "0")) PRINT_AST = os.getenv("PRINT_AST", "0") TEST_AST = int(os.getenv("TEST_AST", "0")) -class CLBuffer: - def __init__(self, size): - if len(CL.BUFFER_CACHE[size]) > 0: - self.cl = CL.BUFFER_CACHE[size].pop() - else: - # TODO: on GPU OOM, clear the cache - self.cl = cl.Buffer(CL().cl_ctx, cl.mem_flags.READ_WRITE, size) - CL.mem_used += self.cl.size - - def __del__(self): - if CLCACHE: - CL.BUFFER_CACHE[self.cl.size].append(self.cl) - else: - CL.mem_used -= self.cl.size - -class CLImage: - fmt = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.HALF_FLOAT if FLOAT16 else cl.channel_type.FLOAT) - - def __init__(self, shape): - self.cl = cl.Image(CL().cl_ctx, cl.mem_flags.READ_WRITE, CLImage.fmt, shape=(shape[1], shape[0])) - CL.mem_used += self.cl.row_pitch * self.cl.height - - def __del__(self): - CL.mem_used -= self.cl.row_pitch * self.cl.height - -class CL: - CACHE, kernel_count, mem_used, time_sum, ops_sum = None, -1, 0, 0.0, 0.0 - BUFFER_CACHE : Dict[int, List[cl.Buffer]] = defaultdict(list) - cl_ctx : Optional[cl.Context] = None - cl_queue : Optional[cl.CommandQueue] = None - def __init__(self): - if CL.cl_queue is not None: return # already initted - devices = sum([x.get_devices(device_type=cl.device_type.GPU) for x in cl.get_platforms()], []) - if len(devices) == 0: # settle for CPU - devices = sum([x.get_devices(device_type=cl.device_type.CPU) for x in cl.get_platforms()], []) - CL.cl_ctx = cl.Context(devices=[devices[int(os.getenv("CL_DEVICE", "0"))]]) - if len(devices) > 1 or DEBUG >= 1: print(f"using {CL.cl_ctx.devices}") - CL.cl_queue = cl.CommandQueue(self.cl_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) # this is an in-order command queue - - @staticmethod - def enqueue_copy(a, b, is_blocking=False): - if CL.CACHE is not None: assert False, f"can't copy {a} -> {b} while caching" - if DEBUG >= 1: print(f"**CL** copy in {b.shape}" if isinstance(b, np.ndarray) else f"**CL** copy OUT {a.shape}") - cl.enqueue_copy(CL().cl_queue, a, b, is_blocking=is_blocking) - -@functools.lru_cache(maxsize=None) -class CLProgram: - kernel_cnt : Dict[str, int] = defaultdict(int) - def __init__(self, name:str, prg:str, options:Tuple[str, ...]=tuple(), argdtypes=None, rename=True, binary=False, op_estimate=0): - self.name = f"{name}{('_N'+str(CLProgram.kernel_cnt[name])) if CLProgram.kernel_cnt[name] else ''}" if rename else name - self.prg, self.options, self.argdtypes, self.op_estimate = prg.replace(f"{name}(", f"{self.name}(") if rename else prg, options, argdtypes, op_estimate - self.clprogram = cl.Program(CL().cl_ctx, CL().cl_ctx.devices, [self.prg]) if binary else cl.Program(CL().cl_ctx, self.prg) # type: ignore - try: - self.clprg = self.clprogram.build(options=list(self.options)).__getattr__(self.name) - except cl.RuntimeError as e: - if DEBUG >= 3: print("FAILED TO BUILD", self.prg) - raise e - if self.argdtypes is not None: - self.clprg.set_scalar_arg_dtypes(self.argdtypes) - CLProgram.kernel_cnt[name] += 1 - def __call__(self, *args): - CL.kernel_count += 1 - if DEBUG >= 4: print(args[0], args[1], self.prg) - if OSX and DEBUG >= 2: st = time.monotonic_ns() - if CL.CACHE is not None: CL.CACHE.append((self, args)) - else: e = self.clprg(CL().cl_queue, *args) - if DEBUG >= 2: - CL.cl_queue.finish() - # NOTE: Profiling is (sadly) broken in OS X, so we take the real kernel time - # BOUNTY: will paypal $50 to anyone who fixes this - et = (time.monotonic_ns() - st) if OSX else (e.profile.end - e.profile.start) - if DEBUG >= 1: - CL.time_sum += 0 if DEBUG <= 1 or CL.CACHE is not None else et - CL.ops_sum += self.op_estimate - print(f"**CL** {CL.kernel_count:6d} {self.name:28s} args {len(args[2:]):5d} kernels {str(args[0]):18s} {str(args[1]):12s} OPs {self.op_estimate/1e6:7.1f}M/{CL.ops_sum/1e9:7.2f}G mem {CL.mem_used/1e9:5.2f} GB " + - (str() if DEBUG <= 1 or CL.CACHE is not None else f"tm {et/1e3:9.2f}us/{CL.time_sum/1e6:9.2f}ms ({self.op_estimate/et:8.2f} GFLOPS)")) - GlobalCounters.global_ops += self.op_estimate - GlobalCounters.global_mem += sum([x.size//4 for x in args[2:] if isinstance(x, cl.Buffer)]) - return e if CL.CACHE is None else None - -# **** end CL wrappers **** - def group_float4(x): assert all(y.typ == Types.FLOAT for y in x) and len(x)%4 == 0 return [Token(f"(float4)({','.join([x[i+j].tok for j in range(4)])})", Types.FLOAT4) for i in range(0, len(x), 4)] @@ -192,6 +109,7 @@ class CLASTKernel(ASTKernel): if isinstance(x.op, ReduceOps) and not do_reduce: return acc values = ([acc] if isinstance(x.op, ReduceOps) else []) + [self.ast_parse(v, acc, do_reduce) for v in x.src] code = CLASTKernel.code_for_op[x.op] # TODO: replace this with a function + if CUDA and x.op == UnaryOps.SIGN: self.prekernel.add("inline __device__ float sign(float x) { float val = (signbit(x) == 0.0f) ? 1.0f : -1.0f; return (x == 0.0f) ? 0.0f : val; }") if len(values) == 2: # TODO: sometimes this is split, sometimes it's multiply if isinstance(x.op, ReduceOps) and values[0][0].typ == Types.FLOAT4 and len(values[0])*4 == len(values[1]): values[0] = split_float4(values[0]) @@ -317,7 +235,7 @@ class CLASTKernel(ASTKernel): print("old:", self.shapes) print("old:", self.strides) - self.hand_coded_optimizations() + if not CUDA: self.hand_coded_optimizations() self.output_shape = list(self.shapes[0][:self.first_reduce]) + self.group_for_reduce if DEBUG >= 3: @@ -330,18 +248,18 @@ class CLASTKernel(ASTKernel): self.loaded_keys : Dict[Tuple[int,int], Token] = {} self.prekernel : Set[str] = set() - self.kernel : List[str] = ["const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"] + self.kernel : List[str] = ["const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"] if any(isinstance(buf._buf, CLImage) for buf in self.bufs) else [] # output_shape[-1] is get_global_id(0) - self.kernel += [f"int idx{len(self.output_shape)-1-i} = get_global_id({i}); /* {self.output_shape[-1-i]} */\n" for i in range(min(3, len(self.output_shape)))] - - if len(self.output_shape) > 3: + MAX_OUTPUT_SHAPE = 3 + self.kernel += [f"int idx{len(self.output_shape)-1-i} = {f'blockDim.{chr(120+i)}*blockIdx.{chr(120+i)}+threadIdx.{chr(120+i)}' if CUDA else f'get_global_id({i})'}; /* {self.output_shape[-1-i]} */\n" for i in range(min(MAX_OUTPUT_SHAPE, len(self.output_shape))) if self.output_shape[-1-i] != 1] + if len(self.output_shape) > MAX_OUTPUT_SHAPE: # sometimes, there's more dimensions. compact all the dimensions into the first one # TODO: these compactions should be searchable - final_dimension = len(self.output_shape)-3 - for i in range(len(self.output_shape)-4, -1, -1): + final_dimension = len(self.output_shape)-MAX_OUTPUT_SHAPE + for i in range(final_dimension-1, -1, -1): self.kernel += [f"int idx{i} = idx{final_dimension} % {self.output_shape[i]};", f"idx{final_dimension} = idx{final_dimension} / {self.output_shape[i]};\n"] - self.output_shape = [prod(self.output_shape[0:-2])] + list(self.output_shape[-2:]) + self.output_shape = [prod(self.output_shape[0:final_dimension+1])] + list(self.output_shape[final_dimension+1:]) if DEBUG >= 3: print(f"replaced output shape with {self.output_shape}") # early ast @@ -383,16 +301,19 @@ class CLASTKernel(ASTKernel): # kernel function definition function_name = ("re_S" if self.reduceop else "ew_S") + '_'.join([str(x) for x in self.bufs[0].shape if x != 1]) - buftypes = [f"{'read_only' if i > 0 else 'write_only'} image2d_t" if isinstance(x._buf, CLImage) else ("__global "+self.buftokens[i].decltype()) for i,x in enumerate(self.bufs)] - self.kernel = list(self.prekernel) + [f"__kernel void {function_name}(",] + \ + buftypes = [f"{'read_only' if i > 0 else 'write_only'} image2d_t" if isinstance(x._buf, CLImage) else ("__global "+self.buftokens[i].decltype()) for i,x in enumerate(self.bufs)] if not CUDA else [self.buftokens[i].decltype() for i,x in enumerate(self.bufs)] + self.kernel = list(self.prekernel) + [f"{'__global__' if CUDA else '__kernel'} void {function_name}(",] + \ [', '.join([f'{t} data{i}' for i,t in enumerate(buftypes) if i not in self.bufs_to_delete])] + \ [") {\n"] + self.kernel # compile kernel self.fxn = CLProgram(function_name, ' '.join(self.kernel), op_estimate=self.info.flops) + mem_estimate = sum(prod(x) for x in self.shapes) if DEBUG >= 3 and len(self.bufs_to_delete): print(f"deleting buffers {self.bufs_to_delete}") def runner(*bufs): + GlobalCounters.global_ops += self.info.flops + GlobalCounters.global_mem += mem_estimate clbufs = [x.cl for i,x in enumerate(bufs) if i not in self.bufs_to_delete] return self.fxn(self.output_shape[::-1] if len(self.output_shape) > 0 else [1], (self.group_for_reduce[::-1] + [1]*(len(self.output_shape)-len(self.group_for_reduce))) if self.group_for_reduce else None, *clbufs) return runner @@ -418,7 +339,7 @@ class GPUBuffer(ExplicitExecAST): if self._buf is None: self._buf = CLImage(self._base_shape) if (len(self._base_shape) == 3 and self._base_shape[2] == 4 and IMAGE >= 2) else CLBuffer(4*prod(self._base_shape)) if self._backing is not None: - CL().enqueue_copy(self._buf.cl, self._backing, is_blocking=False) + self._buf.copyin(self._backing) self._backing = None return self._buf.cl @@ -431,7 +352,7 @@ class GPUBuffer(ExplicitExecAST): data = np.empty(self.shape, dtype=np.float32) cl_buf = self.contiguous() cl_buf = cl_buf if isinstance(cl_buf._buf, CLBuffer) else self.movement_op(MovementOps.RESHAPE, list(self.shape)+[1]).unary_op(UnaryOps.NOOP) - CL().enqueue_copy(data, cl_buf.cl, is_blocking=True) + cl_buf._buf.copyout(data) return data @classmethod diff --git a/tinygrad/runtime/cuda.py b/tinygrad/runtime/cuda.py new file mode 100644 index 0000000000..dc1fe7c5f6 --- /dev/null +++ b/tinygrad/runtime/cuda.py @@ -0,0 +1,24 @@ +import pycuda.autoinit # type: ignore # pylint: disable=unused-import # noqa: F401 +import pycuda.driver as cuda # type: ignore +from pycuda.compiler import SourceModule # type: ignore +import numpy as np +from tinygrad.ops import DEBUG + +class CLImage: + def __init__(self, shape): raise NotImplementedError("CUDA runtime doesn't support images") + +class CLBuffer: + def __init__(self, size): self.cl = cuda.mem_alloc(size) + def copyin(self, b:np.ndarray): cuda.memcpy_htod_async(self.cl, b) + def copyout(self, a:np.ndarray): cuda.memcpy_dtoh(a, self.cl) + +class CLProgram: + def __init__(self, name:str, prg:str, op_estimate:int=0): + self.name, self.op_estimate = name, op_estimate + if DEBUG >= 4: print("CUDA compile", prg) + self.prg = SourceModule(prg).get_function(name) + + def __call__(self, global_size, local_size, *args): + global_size = global_size + [1] * (2 - len(global_size)) + if DEBUG >= 2: print("CUDA launch", global_size, local_size) + self.prg(*args, block=(1,1,1), grid=tuple(global_size)) \ No newline at end of file diff --git a/tinygrad/runtime/opencl.py b/tinygrad/runtime/opencl.py new file mode 100644 index 0000000000..c1a18167f4 --- /dev/null +++ b/tinygrad/runtime/opencl.py @@ -0,0 +1,89 @@ +import os, functools, time, platform +import numpy as np +import pyopencl as cl # type: ignore +from typing import Dict, Optional, Tuple, List +from collections import defaultdict +from tinygrad.ops import DEBUG + +OSX = platform.system() == "Darwin" +CLCACHE = int(os.getenv("CLCACHE", "1")) +FLOAT16 = int(os.getenv("FLOAT16", "0")) + +class CL: + CACHE, kernel_count, mem_used, time_sum, ops_sum = None, -1, 0, 0.0, 0.0 + BUFFER_CACHE : Dict[int, List[cl.Buffer]] = defaultdict(list) + cl_ctx : Optional[cl.Context] = None + cl_queue : Optional[cl.CommandQueue] = None + def __init__(self): + if CL.cl_queue is not None: return # already initted + devices = sum([x.get_devices(device_type=cl.device_type.GPU) for x in cl.get_platforms()], []) + if len(devices) == 0: # settle for CPU + devices = sum([x.get_devices(device_type=cl.device_type.CPU) for x in cl.get_platforms()], []) + CL.cl_ctx = cl.Context(devices=[devices[int(os.getenv("CL_DEVICE", "0"))]]) + if len(devices) > 1 or DEBUG >= 1: print(f"using {CL.cl_ctx.devices}") + CL.cl_queue = cl.CommandQueue(self.cl_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) # this is an in-order command queue + + @staticmethod + def enqueue_copy(a, b, is_blocking=False): + if CL.CACHE is not None: assert False, f"can't copy {a} -> {b} while caching" + if DEBUG >= 1: print(f"**CL** copy in {b.shape}" if isinstance(b, np.ndarray) else f"**CL** copy OUT {a.shape}") + cl.enqueue_copy(CL().cl_queue, a, b, is_blocking=is_blocking) + +class CLBuffer: + def __init__(self, size): + if len(CL.BUFFER_CACHE[size]) > 0: + self.cl = CL.BUFFER_CACHE[size].pop() + else: + # TODO: on GPU OOM, clear the cache + self.cl = cl.Buffer(CL().cl_ctx, cl.mem_flags.READ_WRITE, size) + CL.mem_used += self.cl.size + + def __del__(self): + if CLCACHE: CL.BUFFER_CACHE[self.cl.size].append(self.cl) + else: CL.mem_used -= self.cl.size + + def copyin(self, b:np.ndarray): CL.enqueue_copy(self.cl, b, False) + def copyout(self, a:np.ndarray): CL.enqueue_copy(a, self.cl, True) + +class CLImage: + fmt = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.HALF_FLOAT if FLOAT16 else cl.channel_type.FLOAT) + + def __init__(self, shape): + self.cl = cl.Image(CL().cl_ctx, cl.mem_flags.READ_WRITE, CLImage.fmt, shape=(shape[1], shape[0])) + CL.mem_used += self.cl.row_pitch * self.cl.height + + def __del__(self): + CL.mem_used -= self.cl.row_pitch * self.cl.height + +@functools.lru_cache(maxsize=None) +class CLProgram: + kernel_cnt : Dict[str, int] = defaultdict(int) + def __init__(self, name:str, prg:str, options:Tuple[str, ...]=tuple(), argdtypes=None, rename=True, binary=False, op_estimate=0): + self.name = f"{name}{('_N'+str(CLProgram.kernel_cnt[name])) if CLProgram.kernel_cnt[name] else ''}" if rename else name + self.prg, self.options, self.argdtypes, self.op_estimate = prg.replace(f"{name}(", f"{self.name}(") if rename else prg, options, argdtypes, op_estimate + self.clprogram = cl.Program(CL().cl_ctx, CL().cl_ctx.devices, [self.prg]) if binary else cl.Program(CL().cl_ctx, self.prg) # type: ignore + try: + self.clprg = self.clprogram.build(options=list(self.options)).__getattr__(self.name) + except cl.RuntimeError as e: + if DEBUG >= 3: print("FAILED TO BUILD", self.prg) + raise e + if self.argdtypes is not None: + self.clprg.set_scalar_arg_dtypes(self.argdtypes) + CLProgram.kernel_cnt[name] += 1 + def __call__(self, *args): + CL.kernel_count += 1 + if DEBUG >= 4: print(args[0], args[1], self.prg) + if OSX and DEBUG >= 2: st = time.monotonic_ns() + if CL.CACHE is not None: CL.CACHE.append((self, args)) + else: e = self.clprg(CL().cl_queue, *args) + if DEBUG >= 2: + CL.cl_queue.finish() + # NOTE: Profiling is (sadly) broken in OS X, so we take the real kernel time + # BOUNTY: will paypal $50 to anyone who fixes this + et = (time.monotonic_ns() - st) if OSX else (e.profile.end - e.profile.start) + if DEBUG >= 1: + CL.time_sum += 0 if DEBUG <= 1 or CL.CACHE is not None else et + CL.ops_sum += self.op_estimate + print(f"**CL** {CL.kernel_count:6d} {self.name:28s} args {len(args[2:]):5d} kernels {str(args[0]):18s} {str(args[1]):12s} OPs {self.op_estimate/1e6:7.1f}M/{CL.ops_sum/1e9:7.2f}G mem {CL.mem_used/1e9:5.2f} GB " + + (str() if DEBUG <= 1 or CL.CACHE is not None else f"tm {et/1e3:9.2f}us/{CL.time_sum/1e6:9.2f}ms ({self.op_estimate/et:8.2f} GFLOPS)")) + return e if CL.CACHE is None else None