From d8aba24967c9b71588b7c49ceeb29598c1917e5d Mon Sep 17 00:00:00 2001 From: qazal <77887910+Qazalin@users.noreply.github.com> Date: Sun, 11 Jan 2026 04:25:16 -0500 Subject: [PATCH] amd: use kernel descriptor struct in AMDProgram (#14096) --- tinygrad/runtime/autogen/__init__.py | 2 + tinygrad/runtime/autogen/amdgpu_kd.py | 283 ++++++++++++++++++++++++++ tinygrad/runtime/ops_amd.py | 26 +-- 3 files changed, 298 insertions(+), 13 deletions(-) create mode 100644 tinygrad/runtime/autogen/amdgpu_kd.py diff --git a/tinygrad/runtime/autogen/__init__.py b/tinygrad/runtime/autogen/__init__.py index 29d29f9b47..5617e73f6b 100644 --- a/tinygrad/runtime/autogen/__init__.py +++ b/tinygrad/runtime/autogen/__init__.py @@ -100,6 +100,8 @@ def __getattr__(nm): "amd_hsa_kernel_code", "hsa_ext_finalize", "hsa_ext_image", "hsa_ven_amd_aqlprofile"]]], tarball=rocr_src, args=["-DLITTLEENDIAN_CPU"], prolog=["import os"]) + case "amdgpu_kd": return load("amdgpu_kd", None, lambda: [f"{system('llvm-config-20 --includedir')}/llvm/Support/AMDHSAKernelDescriptor.h"], + args=lambda: system("llvm-config-20 --cflags").split() + ["-x", "c++"], recsym=True, parse_macros=False) case "amd_gpu": return load("amd_gpu", None, [root/f"extra/hip_gpu_driver/{s}.h" for s in ["sdma_registers", "nvd", "gc_11_0_0_offset", "sienna_cichlid_ip_offset"]], args=["-I/opt/rocm/include", "-x", "c++"]) diff --git a/tinygrad/runtime/autogen/amdgpu_kd.py b/tinygrad/runtime/autogen/amdgpu_kd.py new file mode 100644 index 0000000000..05be65a6f8 --- /dev/null +++ b/tinygrad/runtime/autogen/amdgpu_kd.py @@ -0,0 +1,283 @@ +# mypy: ignore-errors +import ctypes +from tinygrad.runtime.support.c import DLL, Struct, CEnum, _IO, _IOW, _IOR, _IOWR +uint8_t = ctypes.c_ubyte +_anonenum0 = CEnum(uint8_t) +FLOAT_ROUND_MODE_NEAR_EVEN = _anonenum0.define('FLOAT_ROUND_MODE_NEAR_EVEN', 0) +FLOAT_ROUND_MODE_PLUS_INFINITY = _anonenum0.define('FLOAT_ROUND_MODE_PLUS_INFINITY', 1) +FLOAT_ROUND_MODE_MINUS_INFINITY = _anonenum0.define('FLOAT_ROUND_MODE_MINUS_INFINITY', 2) +FLOAT_ROUND_MODE_ZERO = _anonenum0.define('FLOAT_ROUND_MODE_ZERO', 3) + +_anonenum1 = CEnum(uint8_t) +FLOAT_DENORM_MODE_FLUSH_SRC_DST = _anonenum1.define('FLOAT_DENORM_MODE_FLUSH_SRC_DST', 0) +FLOAT_DENORM_MODE_FLUSH_DST = _anonenum1.define('FLOAT_DENORM_MODE_FLUSH_DST', 1) +FLOAT_DENORM_MODE_FLUSH_SRC = _anonenum1.define('FLOAT_DENORM_MODE_FLUSH_SRC', 2) +FLOAT_DENORM_MODE_FLUSH_NONE = _anonenum1.define('FLOAT_DENORM_MODE_FLUSH_NONE', 3) + +_anonenum2 = CEnum(uint8_t) +SYSTEM_VGPR_WORKITEM_ID_X = _anonenum2.define('SYSTEM_VGPR_WORKITEM_ID_X', 0) +SYSTEM_VGPR_WORKITEM_ID_X_Y = _anonenum2.define('SYSTEM_VGPR_WORKITEM_ID_X_Y', 1) +SYSTEM_VGPR_WORKITEM_ID_X_Y_Z = _anonenum2.define('SYSTEM_VGPR_WORKITEM_ID_X_Y_Z', 2) +SYSTEM_VGPR_WORKITEM_ID_UNDEFINED = _anonenum2.define('SYSTEM_VGPR_WORKITEM_ID_UNDEFINED', 3) + +int32_t = ctypes.c_int32 +_anonenum3 = CEnum(int32_t) +COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_SHIFT = _anonenum3.define('COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_SHIFT', 0) +COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_WIDTH = _anonenum3.define('COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_WIDTH', 6) +COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT = _anonenum3.define('COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT', 63) +COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_SHIFT = _anonenum3.define('COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_SHIFT', 6) +COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_WIDTH = _anonenum3.define('COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_WIDTH', 4) +COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT = _anonenum3.define('COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT', 960) +COMPUTE_PGM_RSRC1_PRIORITY_SHIFT = _anonenum3.define('COMPUTE_PGM_RSRC1_PRIORITY_SHIFT', 10) +COMPUTE_PGM_RSRC1_PRIORITY_WIDTH = _anonenum3.define('COMPUTE_PGM_RSRC1_PRIORITY_WIDTH', 2) +COMPUTE_PGM_RSRC1_PRIORITY = _anonenum3.define('COMPUTE_PGM_RSRC1_PRIORITY', 3072) +COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32_SHIFT = _anonenum3.define('COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32_SHIFT', 12) +COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32_WIDTH = _anonenum3.define('COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32_WIDTH', 2) +COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32 = _anonenum3.define('COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32', 12288) +COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64_SHIFT = _anonenum3.define('COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64_SHIFT', 14) +COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64_WIDTH = _anonenum3.define('COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64_WIDTH', 2) +COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64 = _anonenum3.define('COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64', 49152) +COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32_SHIFT = _anonenum3.define('COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32_SHIFT', 16) +COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32_WIDTH = _anonenum3.define('COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32_WIDTH', 2) +COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32 = _anonenum3.define('COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32', 196608) +COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64_SHIFT = _anonenum3.define('COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64_SHIFT', 18) +COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64_WIDTH = _anonenum3.define('COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64_WIDTH', 2) +COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64 = _anonenum3.define('COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64', 786432) +COMPUTE_PGM_RSRC1_PRIV_SHIFT = _anonenum3.define('COMPUTE_PGM_RSRC1_PRIV_SHIFT', 20) +COMPUTE_PGM_RSRC1_PRIV_WIDTH = _anonenum3.define('COMPUTE_PGM_RSRC1_PRIV_WIDTH', 1) +COMPUTE_PGM_RSRC1_PRIV = _anonenum3.define('COMPUTE_PGM_RSRC1_PRIV', 1048576) +COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP_SHIFT = _anonenum3.define('COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP_SHIFT', 21) +COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP_WIDTH = _anonenum3.define('COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP_WIDTH', 1) +COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP = _anonenum3.define('COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP', 2097152) +COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN_SHIFT = _anonenum3.define('COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN_SHIFT', 21) +COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN_WIDTH = _anonenum3.define('COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN_WIDTH', 1) +COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN = _anonenum3.define('COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN', 2097152) +COMPUTE_PGM_RSRC1_DEBUG_MODE_SHIFT = _anonenum3.define('COMPUTE_PGM_RSRC1_DEBUG_MODE_SHIFT', 22) +COMPUTE_PGM_RSRC1_DEBUG_MODE_WIDTH = _anonenum3.define('COMPUTE_PGM_RSRC1_DEBUG_MODE_WIDTH', 1) +COMPUTE_PGM_RSRC1_DEBUG_MODE = _anonenum3.define('COMPUTE_PGM_RSRC1_DEBUG_MODE', 4194304) +COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE_SHIFT = _anonenum3.define('COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE_SHIFT', 23) +COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE_WIDTH = _anonenum3.define('COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE_WIDTH', 1) +COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE = _anonenum3.define('COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE', 8388608) +COMPUTE_PGM_RSRC1_GFX12_PLUS_DISABLE_PERF_SHIFT = _anonenum3.define('COMPUTE_PGM_RSRC1_GFX12_PLUS_DISABLE_PERF_SHIFT', 23) +COMPUTE_PGM_RSRC1_GFX12_PLUS_DISABLE_PERF_WIDTH = _anonenum3.define('COMPUTE_PGM_RSRC1_GFX12_PLUS_DISABLE_PERF_WIDTH', 1) +COMPUTE_PGM_RSRC1_GFX12_PLUS_DISABLE_PERF = _anonenum3.define('COMPUTE_PGM_RSRC1_GFX12_PLUS_DISABLE_PERF', 8388608) +COMPUTE_PGM_RSRC1_BULKY_SHIFT = _anonenum3.define('COMPUTE_PGM_RSRC1_BULKY_SHIFT', 24) +COMPUTE_PGM_RSRC1_BULKY_WIDTH = _anonenum3.define('COMPUTE_PGM_RSRC1_BULKY_WIDTH', 1) +COMPUTE_PGM_RSRC1_BULKY = _anonenum3.define('COMPUTE_PGM_RSRC1_BULKY', 16777216) +COMPUTE_PGM_RSRC1_CDBG_USER_SHIFT = _anonenum3.define('COMPUTE_PGM_RSRC1_CDBG_USER_SHIFT', 25) +COMPUTE_PGM_RSRC1_CDBG_USER_WIDTH = _anonenum3.define('COMPUTE_PGM_RSRC1_CDBG_USER_WIDTH', 1) +COMPUTE_PGM_RSRC1_CDBG_USER = _anonenum3.define('COMPUTE_PGM_RSRC1_CDBG_USER', 33554432) +COMPUTE_PGM_RSRC1_GFX6_GFX8_RESERVED0_SHIFT = _anonenum3.define('COMPUTE_PGM_RSRC1_GFX6_GFX8_RESERVED0_SHIFT', 26) +COMPUTE_PGM_RSRC1_GFX6_GFX8_RESERVED0_WIDTH = _anonenum3.define('COMPUTE_PGM_RSRC1_GFX6_GFX8_RESERVED0_WIDTH', 1) +COMPUTE_PGM_RSRC1_GFX6_GFX8_RESERVED0 = _anonenum3.define('COMPUTE_PGM_RSRC1_GFX6_GFX8_RESERVED0', 67108864) +COMPUTE_PGM_RSRC1_GFX9_PLUS_FP16_OVFL_SHIFT = _anonenum3.define('COMPUTE_PGM_RSRC1_GFX9_PLUS_FP16_OVFL_SHIFT', 26) +COMPUTE_PGM_RSRC1_GFX9_PLUS_FP16_OVFL_WIDTH = _anonenum3.define('COMPUTE_PGM_RSRC1_GFX9_PLUS_FP16_OVFL_WIDTH', 1) +COMPUTE_PGM_RSRC1_GFX9_PLUS_FP16_OVFL = _anonenum3.define('COMPUTE_PGM_RSRC1_GFX9_PLUS_FP16_OVFL', 67108864) +COMPUTE_PGM_RSRC1_RESERVED1_SHIFT = _anonenum3.define('COMPUTE_PGM_RSRC1_RESERVED1_SHIFT', 27) +COMPUTE_PGM_RSRC1_RESERVED1_WIDTH = _anonenum3.define('COMPUTE_PGM_RSRC1_RESERVED1_WIDTH', 2) +COMPUTE_PGM_RSRC1_RESERVED1 = _anonenum3.define('COMPUTE_PGM_RSRC1_RESERVED1', 402653184) +COMPUTE_PGM_RSRC1_GFX6_GFX9_RESERVED2_SHIFT = _anonenum3.define('COMPUTE_PGM_RSRC1_GFX6_GFX9_RESERVED2_SHIFT', 29) +COMPUTE_PGM_RSRC1_GFX6_GFX9_RESERVED2_WIDTH = _anonenum3.define('COMPUTE_PGM_RSRC1_GFX6_GFX9_RESERVED2_WIDTH', 3) +COMPUTE_PGM_RSRC1_GFX6_GFX9_RESERVED2 = _anonenum3.define('COMPUTE_PGM_RSRC1_GFX6_GFX9_RESERVED2', -536870912) +COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE_SHIFT = _anonenum3.define('COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE_SHIFT', 29) +COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE_WIDTH = _anonenum3.define('COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE_WIDTH', 1) +COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE = _anonenum3.define('COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE', 536870912) +COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED_SHIFT = _anonenum3.define('COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED_SHIFT', 30) +COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED_WIDTH = _anonenum3.define('COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED_WIDTH', 1) +COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED = _anonenum3.define('COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED', 1073741824) +COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS_SHIFT = _anonenum3.define('COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS_SHIFT', 31) +COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS_WIDTH = _anonenum3.define('COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS_WIDTH', 1) +COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS = _anonenum3.define('COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS', -2147483648) + +_anonenum4 = CEnum(int32_t) +COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT_SHIFT = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT_SHIFT', 0) +COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT_WIDTH = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT_WIDTH', 1) +COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT', 1) +COMPUTE_PGM_RSRC2_USER_SGPR_COUNT_SHIFT = _anonenum4.define('COMPUTE_PGM_RSRC2_USER_SGPR_COUNT_SHIFT', 1) +COMPUTE_PGM_RSRC2_USER_SGPR_COUNT_WIDTH = _anonenum4.define('COMPUTE_PGM_RSRC2_USER_SGPR_COUNT_WIDTH', 5) +COMPUTE_PGM_RSRC2_USER_SGPR_COUNT = _anonenum4.define('COMPUTE_PGM_RSRC2_USER_SGPR_COUNT', 62) +COMPUTE_PGM_RSRC2_GFX6_GFX11_ENABLE_TRAP_HANDLER_SHIFT = _anonenum4.define('COMPUTE_PGM_RSRC2_GFX6_GFX11_ENABLE_TRAP_HANDLER_SHIFT', 6) +COMPUTE_PGM_RSRC2_GFX6_GFX11_ENABLE_TRAP_HANDLER_WIDTH = _anonenum4.define('COMPUTE_PGM_RSRC2_GFX6_GFX11_ENABLE_TRAP_HANDLER_WIDTH', 1) +COMPUTE_PGM_RSRC2_GFX6_GFX11_ENABLE_TRAP_HANDLER = _anonenum4.define('COMPUTE_PGM_RSRC2_GFX6_GFX11_ENABLE_TRAP_HANDLER', 64) +COMPUTE_PGM_RSRC2_GFX12_PLUS_RESERVED1_SHIFT = _anonenum4.define('COMPUTE_PGM_RSRC2_GFX12_PLUS_RESERVED1_SHIFT', 6) +COMPUTE_PGM_RSRC2_GFX12_PLUS_RESERVED1_WIDTH = _anonenum4.define('COMPUTE_PGM_RSRC2_GFX12_PLUS_RESERVED1_WIDTH', 1) +COMPUTE_PGM_RSRC2_GFX12_PLUS_RESERVED1 = _anonenum4.define('COMPUTE_PGM_RSRC2_GFX12_PLUS_RESERVED1', 64) +COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X_SHIFT = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X_SHIFT', 7) +COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X_WIDTH = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X_WIDTH', 1) +COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X', 128) +COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y_SHIFT = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y_SHIFT', 8) +COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y_WIDTH = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y_WIDTH', 1) +COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y', 256) +COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z_SHIFT = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z_SHIFT', 9) +COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z_WIDTH = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z_WIDTH', 1) +COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z', 512) +COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO_SHIFT = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO_SHIFT', 10) +COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO_WIDTH = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO_WIDTH', 1) +COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO', 1024) +COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID_SHIFT = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID_SHIFT', 11) +COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID_WIDTH = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID_WIDTH', 2) +COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID', 6144) +COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_ADDRESS_WATCH_SHIFT = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_ADDRESS_WATCH_SHIFT', 13) +COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_ADDRESS_WATCH_WIDTH = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_ADDRESS_WATCH_WIDTH', 1) +COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_ADDRESS_WATCH = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_ADDRESS_WATCH', 8192) +COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_MEMORY_SHIFT = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_MEMORY_SHIFT', 14) +COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_MEMORY_WIDTH = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_MEMORY_WIDTH', 1) +COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_MEMORY = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_MEMORY', 16384) +COMPUTE_PGM_RSRC2_GRANULATED_LDS_SIZE_SHIFT = _anonenum4.define('COMPUTE_PGM_RSRC2_GRANULATED_LDS_SIZE_SHIFT', 15) +COMPUTE_PGM_RSRC2_GRANULATED_LDS_SIZE_WIDTH = _anonenum4.define('COMPUTE_PGM_RSRC2_GRANULATED_LDS_SIZE_WIDTH', 9) +COMPUTE_PGM_RSRC2_GRANULATED_LDS_SIZE = _anonenum4.define('COMPUTE_PGM_RSRC2_GRANULATED_LDS_SIZE', 16744448) +COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION_SHIFT = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION_SHIFT', 24) +COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION_WIDTH = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION_WIDTH', 1) +COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION', 16777216) +COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE_SHIFT = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE_SHIFT', 25) +COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE_WIDTH = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE_WIDTH', 1) +COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE', 33554432) +COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO_SHIFT = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO_SHIFT', 26) +COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO_WIDTH = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO_WIDTH', 1) +COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO', 67108864) +COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW_SHIFT = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW_SHIFT', 27) +COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW_WIDTH = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW_WIDTH', 1) +COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW', 134217728) +COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW_SHIFT = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW_SHIFT', 28) +COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW_WIDTH = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW_WIDTH', 1) +COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW', 268435456) +COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT_SHIFT = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT_SHIFT', 29) +COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT_WIDTH = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT_WIDTH', 1) +COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT', 536870912) +COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO_SHIFT = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO_SHIFT', 30) +COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO_WIDTH = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO_WIDTH', 1) +COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO = _anonenum4.define('COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO', 1073741824) +COMPUTE_PGM_RSRC2_RESERVED0_SHIFT = _anonenum4.define('COMPUTE_PGM_RSRC2_RESERVED0_SHIFT', 31) +COMPUTE_PGM_RSRC2_RESERVED0_WIDTH = _anonenum4.define('COMPUTE_PGM_RSRC2_RESERVED0_WIDTH', 1) +COMPUTE_PGM_RSRC2_RESERVED0 = _anonenum4.define('COMPUTE_PGM_RSRC2_RESERVED0', -2147483648) + +_anonenum5 = CEnum(int32_t) +COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT = _anonenum5.define('COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT', 0) +COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_WIDTH = _anonenum5.define('COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_WIDTH', 6) +COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET = _anonenum5.define('COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET', 63) +COMPUTE_PGM_RSRC3_GFX90A_RESERVED0_SHIFT = _anonenum5.define('COMPUTE_PGM_RSRC3_GFX90A_RESERVED0_SHIFT', 6) +COMPUTE_PGM_RSRC3_GFX90A_RESERVED0_WIDTH = _anonenum5.define('COMPUTE_PGM_RSRC3_GFX90A_RESERVED0_WIDTH', 10) +COMPUTE_PGM_RSRC3_GFX90A_RESERVED0 = _anonenum5.define('COMPUTE_PGM_RSRC3_GFX90A_RESERVED0', 65472) +COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT = _anonenum5.define('COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT', 16) +COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_WIDTH = _anonenum5.define('COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_WIDTH', 1) +COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT = _anonenum5.define('COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT', 65536) +COMPUTE_PGM_RSRC3_GFX90A_RESERVED1_SHIFT = _anonenum5.define('COMPUTE_PGM_RSRC3_GFX90A_RESERVED1_SHIFT', 17) +COMPUTE_PGM_RSRC3_GFX90A_RESERVED1_WIDTH = _anonenum5.define('COMPUTE_PGM_RSRC3_GFX90A_RESERVED1_WIDTH', 15) +COMPUTE_PGM_RSRC3_GFX90A_RESERVED1 = _anonenum5.define('COMPUTE_PGM_RSRC3_GFX90A_RESERVED1', -131072) + +_anonenum6 = CEnum(int32_t) +COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT_SHIFT = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT_SHIFT', 0) +COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT_WIDTH = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT_WIDTH', 4) +COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT', 15) +COMPUTE_PGM_RSRC3_GFX12_PLUS_RESERVED0_SHIFT = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX12_PLUS_RESERVED0_SHIFT', 0) +COMPUTE_PGM_RSRC3_GFX12_PLUS_RESERVED0_WIDTH = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX12_PLUS_RESERVED0_WIDTH', 4) +COMPUTE_PGM_RSRC3_GFX12_PLUS_RESERVED0 = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX12_PLUS_RESERVED0', 15) +COMPUTE_PGM_RSRC3_GFX10_RESERVED1_SHIFT = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX10_RESERVED1_SHIFT', 4) +COMPUTE_PGM_RSRC3_GFX10_RESERVED1_WIDTH = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX10_RESERVED1_WIDTH', 8) +COMPUTE_PGM_RSRC3_GFX10_RESERVED1 = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX10_RESERVED1', 4080) +COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT', 4) +COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH', 6) +COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE', 1008) +COMPUTE_PGM_RSRC3_GFX11_TRAP_ON_START_SHIFT = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX11_TRAP_ON_START_SHIFT', 10) +COMPUTE_PGM_RSRC3_GFX11_TRAP_ON_START_WIDTH = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX11_TRAP_ON_START_WIDTH', 1) +COMPUTE_PGM_RSRC3_GFX11_TRAP_ON_START = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX11_TRAP_ON_START', 1024) +COMPUTE_PGM_RSRC3_GFX11_TRAP_ON_END_SHIFT = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX11_TRAP_ON_END_SHIFT', 11) +COMPUTE_PGM_RSRC3_GFX11_TRAP_ON_END_WIDTH = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX11_TRAP_ON_END_WIDTH', 1) +COMPUTE_PGM_RSRC3_GFX11_TRAP_ON_END = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX11_TRAP_ON_END', 2048) +COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT', 4) +COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH', 8) +COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE', 4080) +COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED2_SHIFT = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED2_SHIFT', 12) +COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED2_WIDTH = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED2_WIDTH', 1) +COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED2 = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED2', 4096) +COMPUTE_PGM_RSRC3_GFX10_GFX11_RESERVED3_SHIFT = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX10_GFX11_RESERVED3_SHIFT', 13) +COMPUTE_PGM_RSRC3_GFX10_GFX11_RESERVED3_WIDTH = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX10_GFX11_RESERVED3_WIDTH', 1) +COMPUTE_PGM_RSRC3_GFX10_GFX11_RESERVED3 = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX10_GFX11_RESERVED3', 8192) +COMPUTE_PGM_RSRC3_GFX12_PLUS_GLG_EN_SHIFT = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX12_PLUS_GLG_EN_SHIFT', 13) +COMPUTE_PGM_RSRC3_GFX12_PLUS_GLG_EN_WIDTH = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX12_PLUS_GLG_EN_WIDTH', 1) +COMPUTE_PGM_RSRC3_GFX12_PLUS_GLG_EN = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX12_PLUS_GLG_EN', 8192) +COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED4_SHIFT = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED4_SHIFT', 14) +COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED4_WIDTH = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED4_WIDTH', 17) +COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED4 = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED4', 2147467264) +COMPUTE_PGM_RSRC3_GFX10_RESERVED5_SHIFT = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX10_RESERVED5_SHIFT', 31) +COMPUTE_PGM_RSRC3_GFX10_RESERVED5_WIDTH = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX10_RESERVED5_WIDTH', 1) +COMPUTE_PGM_RSRC3_GFX10_RESERVED5 = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX10_RESERVED5', -2147483648) +COMPUTE_PGM_RSRC3_GFX11_PLUS_IMAGE_OP_SHIFT = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX11_PLUS_IMAGE_OP_SHIFT', 31) +COMPUTE_PGM_RSRC3_GFX11_PLUS_IMAGE_OP_WIDTH = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX11_PLUS_IMAGE_OP_WIDTH', 1) +COMPUTE_PGM_RSRC3_GFX11_PLUS_IMAGE_OP = _anonenum6.define('COMPUTE_PGM_RSRC3_GFX11_PLUS_IMAGE_OP', -2147483648) + +_anonenum7 = CEnum(int32_t) +KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT = _anonenum7.define('KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT', 0) +KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH = _anonenum7.define('KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH', 1) +KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER = _anonenum7.define('KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER', 1) +KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT = _anonenum7.define('KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT', 1) +KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH = _anonenum7.define('KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH', 1) +KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR = _anonenum7.define('KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR', 2) +KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT = _anonenum7.define('KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT', 2) +KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH = _anonenum7.define('KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH', 1) +KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR = _anonenum7.define('KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR', 4) +KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT = _anonenum7.define('KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT', 3) +KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH = _anonenum7.define('KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH', 1) +KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR = _anonenum7.define('KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR', 8) +KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT = _anonenum7.define('KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT', 4) +KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH = _anonenum7.define('KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH', 1) +KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID = _anonenum7.define('KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID', 16) +KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT = _anonenum7.define('KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT', 5) +KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH = _anonenum7.define('KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH', 1) +KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT = _anonenum7.define('KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT', 32) +KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT = _anonenum7.define('KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT', 6) +KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH = _anonenum7.define('KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH', 1) +KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE = _anonenum7.define('KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE', 64) +KERNEL_CODE_PROPERTY_RESERVED0_SHIFT = _anonenum7.define('KERNEL_CODE_PROPERTY_RESERVED0_SHIFT', 7) +KERNEL_CODE_PROPERTY_RESERVED0_WIDTH = _anonenum7.define('KERNEL_CODE_PROPERTY_RESERVED0_WIDTH', 3) +KERNEL_CODE_PROPERTY_RESERVED0 = _anonenum7.define('KERNEL_CODE_PROPERTY_RESERVED0', 896) +KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT = _anonenum7.define('KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT', 10) +KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_WIDTH = _anonenum7.define('KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_WIDTH', 1) +KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32 = _anonenum7.define('KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32', 1024) +KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT = _anonenum7.define('KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT', 11) +KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_WIDTH = _anonenum7.define('KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_WIDTH', 1) +KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK = _anonenum7.define('KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK', 2048) +KERNEL_CODE_PROPERTY_RESERVED1_SHIFT = _anonenum7.define('KERNEL_CODE_PROPERTY_RESERVED1_SHIFT', 12) +KERNEL_CODE_PROPERTY_RESERVED1_WIDTH = _anonenum7.define('KERNEL_CODE_PROPERTY_RESERVED1_WIDTH', 4) +KERNEL_CODE_PROPERTY_RESERVED1 = _anonenum7.define('KERNEL_CODE_PROPERTY_RESERVED1', 61440) + +_anonenum8 = CEnum(int32_t) +KERNARG_PRELOAD_SPEC_LENGTH_SHIFT = _anonenum8.define('KERNARG_PRELOAD_SPEC_LENGTH_SHIFT', 0) +KERNARG_PRELOAD_SPEC_LENGTH_WIDTH = _anonenum8.define('KERNARG_PRELOAD_SPEC_LENGTH_WIDTH', 7) +KERNARG_PRELOAD_SPEC_LENGTH = _anonenum8.define('KERNARG_PRELOAD_SPEC_LENGTH', 127) +KERNARG_PRELOAD_SPEC_OFFSET_SHIFT = _anonenum8.define('KERNARG_PRELOAD_SPEC_OFFSET_SHIFT', 7) +KERNARG_PRELOAD_SPEC_OFFSET_WIDTH = _anonenum8.define('KERNARG_PRELOAD_SPEC_OFFSET_WIDTH', 9) +KERNARG_PRELOAD_SPEC_OFFSET = _anonenum8.define('KERNARG_PRELOAD_SPEC_OFFSET', 65408) + +class llvm_amdhsa_kernel_descriptor_t(Struct): pass +uint32_t = ctypes.c_uint32 +int64_t = ctypes.c_int64 +uint16_t = ctypes.c_uint16 +llvm_amdhsa_kernel_descriptor_t._fields_ = [ + ('group_segment_fixed_size', uint32_t), + ('private_segment_fixed_size', uint32_t), + ('kernarg_size', uint32_t), + ('reserved0', (uint8_t * 4)), + ('kernel_code_entry_byte_offset', int64_t), + ('reserved1', (uint8_t * 20)), + ('compute_pgm_rsrc3', uint32_t), + ('compute_pgm_rsrc1', uint32_t), + ('compute_pgm_rsrc2', uint32_t), + ('kernel_code_properties', uint16_t), + ('kernarg_preload', uint16_t), + ('reserved3', (uint8_t * 4)), +] +_anonenum9 = CEnum(uint32_t) +GROUP_SEGMENT_FIXED_SIZE_OFFSET = _anonenum9.define('GROUP_SEGMENT_FIXED_SIZE_OFFSET', 0) +PRIVATE_SEGMENT_FIXED_SIZE_OFFSET = _anonenum9.define('PRIVATE_SEGMENT_FIXED_SIZE_OFFSET', 4) +KERNARG_SIZE_OFFSET = _anonenum9.define('KERNARG_SIZE_OFFSET', 8) +RESERVED0_OFFSET = _anonenum9.define('RESERVED0_OFFSET', 12) +KERNEL_CODE_ENTRY_BYTE_OFFSET_OFFSET = _anonenum9.define('KERNEL_CODE_ENTRY_BYTE_OFFSET_OFFSET', 16) +RESERVED1_OFFSET = _anonenum9.define('RESERVED1_OFFSET', 24) +COMPUTE_PGM_RSRC3_OFFSET = _anonenum9.define('COMPUTE_PGM_RSRC3_OFFSET', 44) +COMPUTE_PGM_RSRC1_OFFSET = _anonenum9.define('COMPUTE_PGM_RSRC1_OFFSET', 48) +COMPUTE_PGM_RSRC2_OFFSET = _anonenum9.define('COMPUTE_PGM_RSRC2_OFFSET', 52) +KERNEL_CODE_PROPERTIES_OFFSET = _anonenum9.define('KERNEL_CODE_PROPERTIES_OFFSET', 56) +KERNARG_PRELOAD_OFFSET = _anonenum9.define('KERNARG_PRELOAD_OFFSET', 58) +RESERVED3_OFFSET = _anonenum9.define('RESERVED3_OFFSET', 60) + diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index dcb81b3fa4..57f1a38c3a 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -11,7 +11,7 @@ from tinygrad.helpers import getenv, round_up, data64_le, DEBUG, PROFILE, Profil from tinygrad.helpers import VIZ, AMD_CC, AMD_LLVM, ceildiv from tinygrad.renderer.cstyle import AMDHIPRenderer, AMDHIPCCRenderer from tinygrad.renderer.llvmir import AMDLLVMRenderer -from tinygrad.runtime.autogen import kfd, hsa, pci, sqtt +from tinygrad.runtime.autogen import kfd, hsa, pci, sqtt, amdgpu_kd from tinygrad.runtime.autogen.am import am from tinygrad.runtime.support.elf import elf_loader from tinygrad.runtime.support.am.amdev import AMDev, AMMemoryManager @@ -557,29 +557,29 @@ class AMDProgram(HCQProgram): self.dev.allocator._copyin(self.lib_gpu, image) self.dev.synchronize() - self.group_segment_size = image[rodata_entry:rodata_entry+4].cast("I")[0] - self.private_segment_size = image[rodata_entry+4:rodata_entry+8].cast("I")[0] - self.kernargs_segment_size = image[rodata_entry+8:rodata_entry+12].cast("I")[0] + desc_sz = ctypes.sizeof(amdgpu_kd.llvm_amdhsa_kernel_descriptor_t) + desc = amdgpu_kd.llvm_amdhsa_kernel_descriptor_t.from_buffer_copy(bytes(image[rodata_entry:rodata_entry+desc_sz])) + self.group_segment_size = desc.group_segment_fixed_size + self.private_segment_size = desc.private_segment_fixed_size + self.kernargs_segment_size = desc.kernarg_size lds_size = ((self.group_segment_size + 511) // 512) & 0x1FF if lds_size > (self.dev.iface.props['lds_size_in_kb'] * 1024) // 512: raise RuntimeError("Too many resources requested: group_segment_size") # Ensure scratch size self.dev._ensure_has_local_memory(self.private_segment_size) - # NOTE: this is wrong, it's not this object. pad it, since it might be smaller than the struct - code = hsa.amd_kernel_code_t.from_buffer_copy(bytes(image[rodata_entry:rodata_entry+256]) + b'\x00'*256) - self.wave32: bool = code.kernel_code_properties & 0x400 == 0x400 + self.wave32: bool = desc.kernel_code_properties & 0x400 == 0x400 # Set rsrc1.priv=1 on gfx11 to workaround cwsr. - self.rsrc1: int = code.compute_pgm_rsrc1 | ((1 << 20) if (11,0,0) <= self.dev.target < (12,0,0) else 0) - self.rsrc2: int = code.compute_pgm_rsrc2 | (lds_size << 15) - self.rsrc3: int = image[rodata_entry+44:rodata_entry+48].cast("I")[0] # NOTE: kernel descriptor, not in amd_kernel_code_t struct + self.rsrc1: int = desc.compute_pgm_rsrc1 | ((1 << 20) if (11,0,0) <= self.dev.target < (12,0,0) else 0) + self.rsrc2: int = desc.compute_pgm_rsrc2 | (lds_size << 15) + self.rsrc3: int = desc.compute_pgm_rsrc3 self.aql_prog_addr: int = self.lib_gpu.va_addr + rodata_entry - self.prog_addr: int = self.lib_gpu.va_addr + rodata_entry + code.kernel_code_entry_byte_offset + self.prog_addr: int = self.lib_gpu.va_addr + rodata_entry + desc.kernel_code_entry_byte_offset # Some programs use hsa_kernel_dispatch_packet_t to read workgroup sizes during execution. # The packet is represented as a pointer and set up in SGPRs. Space for the packet is allocated as part of the kernel arguments. - self.enable_dispatch_ptr: int = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR - self.enable_private_segment_sgpr: int = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER + self.enable_dispatch_ptr: int = desc.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR + self.enable_private_segment_sgpr: int = desc.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER additional_alloc_sz = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t) if self.enable_dispatch_ptr else 0 if dev.sqtt_enabled: self.libhash: tuple[int, int] = struct.unpack('