diff --git a/tinygrad/renderer/metal.py b/tinygrad/renderer/metal.py new file mode 100644 index 0000000000..5b033a408b --- /dev/null +++ b/tinygrad/renderer/metal.py @@ -0,0 +1,16 @@ +import functools +from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage + +class MetalLanguage(CStyleLanguage): + kernel_prefix = "#include \nusing namespace metal;\nkernel " + buffer_prefix = "device " + smem_prefix = "threadgroup " + arg_int_prefix = "constant int&" + barrier = "threadgroup_barrier(mem_flags::mem_threadgroup);" + float4 = "float4" + uses_ptr_arithmetic=True + gid = [f"gid.{chr(120+i)}" for i in range(3)] + lid = [f"lid.{chr(120+i)}" for i in range(3)] + extra_args = ['uint3 gid [[threadgroup_position_in_grid]]', 'uint3 lid [[thread_position_in_threadgroup]]'] + +MetalRenderer = functools.partial(uops_to_cstyle, MetalLanguage()) diff --git a/tinygrad/renderer/opencl.py b/tinygrad/renderer/opencl.py new file mode 100644 index 0000000000..e4b1a13826 --- /dev/null +++ b/tinygrad/renderer/opencl.py @@ -0,0 +1,19 @@ +import functools +from tinygrad.helpers import dtypes +from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage + +type_map = { dtypes.uint8: "uchar", dtypes.uint32: "uint", dtypes.uint64: "ulong" } +class OpenCLLanguage(CStyleLanguage): + kernel_prefix = "__kernel " + buffer_prefix = "__global " + smem_align = "__attribute__ ((aligned (16))) " + smem_prefix = "__local " + arg_int_prefix = "const int" + half_prekernel = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable" + barrier = "barrier(CLK_LOCAL_MEM_FENCE);" + float4 = "(float4)" + gid = [f'get_group_id({i})' for i in range(3)] + lid = [f'get_local_id({i})' for i in range(3)] + uses_vload=True + +OpenCLRenderer = functools.partial(uops_to_cstyle, OpenCLLanguage()) diff --git a/tinygrad/runtime/ops_gpu.py b/tinygrad/runtime/ops_gpu.py index bda2d4bb89..6fe1a23ac3 100644 --- a/tinygrad/runtime/ops_gpu.py +++ b/tinygrad/runtime/ops_gpu.py @@ -1,13 +1,13 @@ from __future__ import annotations -import pathlib, functools +import pathlib import numpy as np import pyopencl as cl # type: ignore from typing import Optional, List from tinygrad.helpers import DEBUG, getenv, prod, ImageDType, OSX, fromimport from tinygrad.ops import Compiled +from tinygrad.renderer.opencl import OpenCLRenderer from tinygrad.runtime.lib import RawBufferCopyInOut, LRUAllocator, RawBufferTransfer from tinygrad.codegen.kernel import LinearizerOptions -from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage OSX_TIMING_RATIO = (125/3) if OSX else 1.0 # see test/external/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something @@ -103,9 +103,4 @@ class CLProgram: return None return None -renderer = functools.partial(uops_to_cstyle, CStyleLanguage( - kernel_prefix = "__kernel ", buffer_prefix = "__global ", smem_align = "__attribute__ ((aligned (16))) ", smem_prefix = "__local ", arg_int_prefix = "const int", - half_prekernel = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable", - barrier = "barrier(CLK_LOCAL_MEM_FENCE);", float4 = "(float4)", - gid = [f'get_group_id({i})' for i in range(3)], lid = [f'get_local_id({i})' for i in range(3)], uses_vload=True)) -GPUBuffer = Compiled(CLBuffer, LinearizerOptions(), renderer, CLProgram, CL.synchronize) +GPUBuffer = Compiled(CLBuffer, LinearizerOptions(), OpenCLRenderer, CLProgram, CL.synchronize) diff --git a/tinygrad/runtime/ops_metal.py b/tinygrad/runtime/ops_metal.py index 1af07702fa..aad17b2ab6 100644 --- a/tinygrad/runtime/ops_metal.py +++ b/tinygrad/runtime/ops_metal.py @@ -1,11 +1,11 @@ # pip3 install pyobjc-framework-Metal pyobjc-framework-Cocoa pyobjc-framework-libdispatch -import os, subprocess, pathlib, functools, ctypes +import os, subprocess, pathlib, ctypes import Metal, Cocoa, libdispatch # type: ignore from typing import List, Any, Tuple from tinygrad.codegen.kernel import LinearizerOptions -from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage from tinygrad.helpers import prod, getenv, DEBUG, DType, dtypes from tinygrad.ops import Compiled, ASTRunner, BasicBatchExecutor +from tinygrad.renderer.metal import MetalRenderer from tinygrad.runtime.lib import RawBufferMapped, LRUAllocator METAL_XCODE = getenv("METAL_XCODE") @@ -100,9 +100,4 @@ class MetalProgram: return command_buffer.GPUEndTime() - command_buffer.GPUStartTime() METAL.mtl_buffers_in_flight.append(command_buffer) -renderer = functools.partial(uops_to_cstyle, CStyleLanguage( - kernel_prefix = "#include \nusing namespace metal;\nkernel ", buffer_prefix = "device ", smem_prefix = "threadgroup ", arg_int_prefix = "constant int&", - barrier = "threadgroup_barrier(mem_flags::mem_threadgroup);", float4 = "float4", uses_ptr_arithmetic=True, - gid = [f"gid.{chr(120+i)}" for i in range(3)], lid = [f"lid.{chr(120+i)}" for i in range(3)], - extra_args = ['uint3 gid [[threadgroup_position_in_grid]]', 'uint3 lid [[thread_position_in_threadgroup]]'])) -MetalBuffer = Compiled(RawMetalBuffer, LinearizerOptions(device="METAL"), renderer, MetalProgram, METAL.synchronize, MetalBatchExecutor) +MetalBuffer = Compiled(RawMetalBuffer, LinearizerOptions(device="METAL"), MetalRenderer, MetalProgram, METAL.synchronize, MetalBatchExecutor)