kfd driver wip (#3912)

* kfd driver wip

* cleanups

* kfd almost ready to ring doorbell

* ding dong?

* issues with signals

* something

* works

* ops kfd

* add amd_signal_t

* works...sometimes

* program runs

* _gpu_alloc cleanup

* cleanups

* work

* header + enable profiling (#3959)

* header + enable profiling

* just cleaner

* measure

* only local time domain

* remove old comments

* fix with master

* elf parsing (#3965)

* elf parsing

* fix kernels with private

* not used

* clean up

* clean up 2

* add flags

* kfd sdma (#3970)

* working sdma

* remove driver, shorter

* all commands we might need

* svm

* kfd remove hardcoded values (#4007)

* remove hardcoded values

* match above line

* 7k lines + revert hsa

* update that from origin

* fix sdma reg gen

* not the updated SDMA

* compiler_opts

* don't require kfd_ioctl

* get ioctls from python

* get ioctls from python

* remove build_sdma_command

* merge into 64-bit fields

* shorter

* fix property spelling and off by one

---------

Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com>
This commit is contained in:
George Hotz
2024-03-30 15:08:12 -07:00
committed by GitHub
parent bee8eeae55
commit 2abb474d43
11 changed files with 3527 additions and 3040 deletions

View File

@@ -112,8 +112,8 @@ jobs:
python -c "from tinygrad.tensor import Tensor; print(Tensor([1,2,3,4,5]))"
- name: Test DEBUG
run: DEBUG=100 python3 -c "from tinygrad import Tensor; N = 1024; a, b = Tensor.rand(N, N), Tensor.rand(N, N); c = (a.reshape(N, 1, N) * b.T.reshape(1, N, N)).sum(axis=2); print((c.numpy() - (a.numpy() @ b.numpy())).mean())"
- name: Repo line count <6500 lines
run: MAX_LINE_COUNT=6500 python sz.py
- name: Repo line count <7000 lines
run: MAX_LINE_COUNT=7000 python sz.py
testcpuimagenet:
name: ImageNet to C Tests

View File

@@ -56,6 +56,13 @@ generate_comgr() {
python3 -c "import tinygrad.runtime.autogen.comgr"
}
generate_kfd() {
clang2py /usr/include/linux/kfd_ioctl.h -o $BASE/kfd.py -k cdefstum
fixup $BASE/kfd.py
sed -i "s\import ctypes\import ctypes, os\g" $BASE/kfd.py
python3 -c "import tinygrad.runtime.autogen.kfd"
}
generate_cuda() {
clang2py /usr/include/cuda.h /usr/include/nvrtc.h -o $BASE/cuda.py -l /usr/lib/x86_64-linux-gnu/libcuda.so -l /usr/lib/x86_64-linux-gnu/libnvrtc.so
sed -i "s\import ctypes\import ctypes, ctypes.util\g" $BASE/cuda.py
@@ -69,10 +76,19 @@ generate_hsa() {
clang2py \
/opt/rocm/include/hsa/hsa.h \
/opt/rocm/include/hsa/hsa_ext_amd.h \
/opt/rocm/include/hsa/amd_hsa_signal.h \
/opt/rocm/include/hsa/amd_hsa_queue.h \
/opt/rocm/include/hsa/hsa_ext_finalize.h /opt/rocm/include/hsa/hsa_ext_image.h \
--clang-args="-I/opt/rocm/include" \
-o $BASE/hsa.py -l /opt/rocm/lib/libhsa-runtime64.so
# clang2py broken when pass -x c++ to prev headers
clang2py extra/hip_gpu_driver/sdma_registers.h \
--clang-args="-I/opt/rocm/include -x c++" \
-o $BASE/amd_sdma.py -l /opt/rocm/lib/libhsa-runtime64.so
fixup $BASE/hsa.py
fixup $BASE/amd_sdma.py
sed -i "s\import ctypes\import ctypes, os\g" $BASE/hsa.py
sed -i "s\'/opt/rocm/\os.getenv('ROCM_PATH', '/opt/rocm/')+'/\g" $BASE/hsa.py
python3 -c "import tinygrad.runtime.autogen.hsa"
@@ -83,6 +99,7 @@ elif [ "$1" == "hip" ]; then generate_hip
elif [ "$1" == "comgr" ]; then generate_comgr
elif [ "$1" == "cuda" ]; then generate_cuda
elif [ "$1" == "hsa" ]; then generate_hsa
elif [ "$1" == "all" ]; then generate_opencl; generate_hip; generate_comgr; generate_cuda; generate_hsa
elif [ "$1" == "kfd" ]; then generate_kfd
elif [ "$1" == "all" ]; then generate_opencl; generate_hip; generate_comgr; generate_cuda; generate_hsa; generate_kfd
else echo "usage: $0 <type>"
fi

View File

@@ -44,10 +44,9 @@ def install_hook(c_function, python_function):
# *** ioctl lib end ***
# clang2py kfd_ioctl.h -o kfd_ioctl.py
from extra.hip_gpu_driver import kfd_ioctl
import tinygrad.runtime.autogen.kfd as kfd_ioctl
def ioctls_from_header():
hdr = (pathlib.Path(__file__).parent.parent.parent / "extra/hip_gpu_driver/kfd_ioctl.h").read_text().replace("\\\n", "")
hdr = pathlib.Path("/usr/include/linux/kfd_ioctl.h").read_text().replace("\\\n", "")
pattern = r'#define\s+(AMDKFD_IOC_[A-Z0-9_]+)\s+AMDKFD_IOW?R?\((0x[0-9a-fA-F]+),\s+struct\s([A-Za-z0-9_]+)\)'
matches = re.findall(pattern, hdr, re.MULTILINE)
return {int(nr, 0x10):(name, getattr(kfd_ioctl, "struct_"+sname)) for name, nr, sname in matches}
@@ -68,7 +67,8 @@ def ioctl(fd, request, argp):
out = ctypes.cast(s.attrs, ctypes.POINTER(kfd_ioctl.struct_kfd_ioctl_svm_attribute))
for i in range(s.nattr): print(f"{i}: {kfd_ioctl.kfd_ioctl_svm_attr_type__enumvalues[out[i].type]:40s}: {out[i].value:#x}")
else:
print("ioctl", f"{idir=} {size=} {itype=} {nr=} {fd=} {ret=}", os.readlink(f"/proc/self/fd/{fd}") if fd >= 0 else "")
print(f"{(st-start)*1000:7.2f} ms +{et*1000.:7.2f} ms : ioctl",
f"{idir=} {size=} {itype=} {nr=} {fd=} {ret=}", os.readlink(f"/proc/self/fd/{fd}") if fd >= 0 else "")
return ret
install_hook(libc.ioctl, ioctl)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1 @@
/usr/include/linux/kfd_ioctl.h

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,571 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
#ifndef HSA_RUNTIME_CORE_INC_SDMA_REGISTERS_H_
#define HSA_RUNTIME_CORE_INC_SDMA_REGISTERS_H_
#include <stddef.h>
#include <stdint.h>
namespace rocr {
namespace AMD {
// SDMA packet for VI device.
// Reference: http://people.freedesktop.org/~agd5f/dma_packets.txt
const unsigned int SDMA_OP_COPY = 1;
const unsigned int SDMA_OP_FENCE = 5;
const unsigned int SDMA_OP_TRAP = 6;
const unsigned int SDMA_OP_POLL_REGMEM = 8;
const unsigned int SDMA_OP_ATOMIC = 10;
const unsigned int SDMA_OP_CONST_FILL = 11;
const unsigned int SDMA_OP_TIMESTAMP = 13;
const unsigned int SDMA_OP_GCR = 17;
const unsigned int SDMA_SUBOP_COPY_LINEAR = 0;
const unsigned int SDMA_SUBOP_COPY_LINEAR_RECT = 4;
const unsigned int SDMA_SUBOP_TIMESTAMP_GET_GLOBAL = 2;
const unsigned int SDMA_SUBOP_USER_GCR = 1;
const unsigned int SDMA_ATOMIC_ADD64 = 47;
typedef struct SDMA_PKT_COPY_LINEAR_TAG {
union {
struct {
unsigned int op : 8;
unsigned int sub_op : 8;
unsigned int extra_info : 16;
};
unsigned int DW_0_DATA;
} HEADER_UNION;
union {
struct {
unsigned int count : 22;
unsigned int reserved_0 : 10;
};
unsigned int DW_1_DATA;
} COUNT_UNION;
union {
struct {
unsigned int reserved_0 : 16;
unsigned int dst_swap : 2;
unsigned int reserved_1 : 6;
unsigned int src_swap : 2;
unsigned int reserved_2 : 6;
};
unsigned int DW_2_DATA;
} PARAMETER_UNION;
union {
struct {
unsigned int src_addr_31_0 : 32;
};
unsigned int DW_3_DATA;
} SRC_ADDR_LO_UNION;
union {
struct {
unsigned int src_addr_63_32 : 32;
};
unsigned int DW_4_DATA;
} SRC_ADDR_HI_UNION;
union {
struct {
unsigned int dst_addr_31_0 : 32;
};
unsigned int DW_5_DATA;
} DST_ADDR_LO_UNION;
union {
struct {
unsigned int dst_addr_63_32 : 32;
};
unsigned int DW_6_DATA;
} DST_ADDR_HI_UNION;
static const size_t kMaxSize_ = 0x3fffe0;
} SDMA_PKT_COPY_LINEAR;
// linear sub-window
typedef struct SDMA_PKT_COPY_LINEAR_RECT_TAG {
static const unsigned int pitch_bits = 19;
static const unsigned int slice_bits = 28;
static const unsigned int rect_xy_bits = 14;
static const unsigned int rect_z_bits = 11;
union {
struct {
unsigned int op : 8;
unsigned int sub_op : 8;
unsigned int reserved : 13;
unsigned int element : 3;
};
unsigned int DW_0_DATA;
} HEADER_UNION;
union {
struct {
unsigned int src_addr_31_0 : 32;
};
unsigned int DW_1_DATA;
} SRC_ADDR_LO_UNION;
union {
struct {
unsigned int src_addr_63_32 : 32;
};
unsigned int DW_2_DATA;
} SRC_ADDR_HI_UNION;
union {
struct {
unsigned int src_offset_x : 14;
unsigned int reserved_1 : 2;
unsigned int src_offset_y : 14;
unsigned int reserved_2 : 2;
};
unsigned int DW_3_DATA;
} SRC_PARAMETER_1_UNION;
union {
struct {
unsigned int src_offset_z : 11;
unsigned int reserved_1 : 2;
unsigned int src_pitch : pitch_bits;
};
unsigned int DW_4_DATA;
} SRC_PARAMETER_2_UNION;
union {
struct {
unsigned int src_slice_pitch : slice_bits;
unsigned int reserved_1 : 4;
};
unsigned int DW_5_DATA;
} SRC_PARAMETER_3_UNION;
union {
struct {
unsigned int dst_addr_31_0 : 32;
};
unsigned int DW_6_DATA;
} DST_ADDR_LO_UNION;
union {
struct {
unsigned int dst_addr_63_32 : 32;
};
unsigned int DW_7_DATA;
} DST_ADDR_HI_UNION;
union {
struct {
unsigned int dst_offset_x : 14;
unsigned int reserved_1 : 2;
unsigned int dst_offset_y : 14;
unsigned int reserved_2 : 2;
};
unsigned int DW_8_DATA;
} DST_PARAMETER_1_UNION;
union {
struct {
unsigned int dst_offset_z : 11;
unsigned int reserved_1 : 2;
unsigned int dst_pitch : pitch_bits;
};
unsigned int DW_9_DATA;
} DST_PARAMETER_2_UNION;
union {
struct {
unsigned int dst_slice_pitch : slice_bits;
unsigned int reserved_1 : 4;
};
unsigned int DW_10_DATA;
} DST_PARAMETER_3_UNION;
union {
struct {
unsigned int rect_x : rect_xy_bits;
unsigned int reserved_1 : 2;
unsigned int rect_y : rect_xy_bits;
unsigned int reserved_2 : 2;
};
unsigned int DW_11_DATA;
} RECT_PARAMETER_1_UNION;
union {
struct {
unsigned int rect_z : rect_z_bits;
unsigned int reserved_1 : 5;
unsigned int dst_swap : 2;
unsigned int reserved_2 : 6;
unsigned int src_swap : 2;
unsigned int reserved_3 : 6;
};
unsigned int DW_12_DATA;
} RECT_PARAMETER_2_UNION;
} SDMA_PKT_COPY_LINEAR_RECT;
typedef struct SDMA_PKT_CONSTANT_FILL_TAG {
union {
struct {
unsigned int op : 8;
unsigned int sub_op : 8;
unsigned int sw : 2;
unsigned int reserved_0 : 12;
unsigned int fillsize : 2;
};
unsigned int DW_0_DATA;
} HEADER_UNION;
union {
struct {
unsigned int dst_addr_31_0 : 32;
};
unsigned int DW_1_DATA;
} DST_ADDR_LO_UNION;
union {
struct {
unsigned int dst_addr_63_32 : 32;
};
unsigned int DW_2_DATA;
} DST_ADDR_HI_UNION;
union {
struct {
unsigned int src_data_31_0 : 32;
};
unsigned int DW_3_DATA;
} DATA_UNION;
union {
struct {
unsigned int count : 22;
unsigned int reserved_0 : 10;
};
unsigned int DW_4_DATA;
} COUNT_UNION;
static const size_t kMaxSize_ = 0x3fffe0;
} SDMA_PKT_CONSTANT_FILL;
typedef struct SDMA_PKT_FENCE_TAG {
union {
struct {
unsigned int op : 8;
unsigned int sub_op : 8;
unsigned int mtype : 3;
unsigned int gcc : 1;
unsigned int sys : 1;
unsigned int pad1 : 1;
unsigned int snp : 1;
unsigned int gpa : 1;
unsigned int l2_policy : 2;
unsigned int reserved_0 : 6;
};
unsigned int DW_0_DATA;
} HEADER_UNION;
union {
struct {
unsigned int addr_31_0 : 32;
};
unsigned int DW_1_DATA;
} ADDR_LO_UNION;
union {
struct {
unsigned int addr_63_32 : 32;
};
unsigned int DW_2_DATA;
} ADDR_HI_UNION;
union {
struct {
unsigned int data : 32;
};
unsigned int DW_3_DATA;
} DATA_UNION;
} SDMA_PKT_FENCE;
typedef struct SDMA_PKT_POLL_REGMEM_TAG {
union {
struct {
unsigned int op : 8;
unsigned int sub_op : 8;
unsigned int reserved_0 : 10;
unsigned int hdp_flush : 1;
unsigned int reserved_1 : 1;
unsigned int func : 3;
unsigned int mem_poll : 1;
};
unsigned int DW_0_DATA;
} HEADER_UNION;
union {
struct {
unsigned int addr_31_0 : 32;
};
unsigned int DW_1_DATA;
} ADDR_LO_UNION;
union {
struct {
unsigned int addr_63_32 : 32;
};
unsigned int DW_2_DATA;
} ADDR_HI_UNION;
union {
struct {
unsigned int value : 32;
};
unsigned int DW_3_DATA;
} VALUE_UNION;
union {
struct {
unsigned int mask : 32;
};
unsigned int DW_4_DATA;
} MASK_UNION;
union {
struct {
unsigned int interval : 16;
unsigned int retry_count : 12;
unsigned int reserved_0 : 4;
};
unsigned int DW_5_DATA;
} DW5_UNION;
} SDMA_PKT_POLL_REGMEM;
typedef struct SDMA_PKT_ATOMIC_TAG {
union {
struct {
unsigned int op : 8;
unsigned int sub_op : 8;
unsigned int l : 1;
unsigned int reserved_0 : 8;
unsigned int operation : 7;
};
unsigned int DW_0_DATA;
} HEADER_UNION;
union {
struct {
unsigned int addr_31_0 : 32;
};
unsigned int DW_1_DATA;
} ADDR_LO_UNION;
union {
struct {
unsigned int addr_63_32 : 32;
};
unsigned int DW_2_DATA;
} ADDR_HI_UNION;
union {
struct {
unsigned int src_data_31_0 : 32;
};
unsigned int DW_3_DATA;
} SRC_DATA_LO_UNION;
union {
struct {
unsigned int src_data_63_32 : 32;
};
unsigned int DW_4_DATA;
} SRC_DATA_HI_UNION;
union {
struct {
unsigned int cmp_data_31_0 : 32;
};
unsigned int DW_5_DATA;
} CMP_DATA_LO_UNION;
union {
struct {
unsigned int cmp_data_63_32 : 32;
};
unsigned int DW_6_DATA;
} CMP_DATA_HI_UNION;
union {
struct {
unsigned int loop_interval : 13;
unsigned int reserved_0 : 19;
};
unsigned int DW_7_DATA;
} LOOP_UNION;
} SDMA_PKT_ATOMIC;
typedef struct SDMA_PKT_TIMESTAMP_TAG {
union {
struct {
unsigned int op : 8;
unsigned int sub_op : 8;
unsigned int reserved_0 : 16;
};
unsigned int DW_0_DATA;
} HEADER_UNION;
union {
struct {
unsigned int addr_31_0 : 32;
};
unsigned int DW_1_DATA;
} ADDR_LO_UNION;
union {
struct {
unsigned int addr_63_32 : 32;
};
unsigned int DW_2_DATA;
} ADDR_HI_UNION;
} SDMA_PKT_TIMESTAMP;
typedef struct SDMA_PKT_TRAP_TAG {
union {
struct {
unsigned int op : 8;
unsigned int sub_op : 8;
unsigned int reserved_0 : 16;
};
unsigned int DW_0_DATA;
} HEADER_UNION;
union {
struct {
unsigned int int_ctx : 28;
unsigned int reserved_1 : 4;
};
unsigned int DW_1_DATA;
} INT_CONTEXT_UNION;
} SDMA_PKT_TRAP;
// HDP flush packet, no parameters.
typedef struct SDMA_PKT_HDP_FLUSH_TAG {
unsigned int DW_0_DATA;
unsigned int DW_1_DATA;
unsigned int DW_2_DATA;
unsigned int DW_3_DATA;
unsigned int DW_4_DATA;
unsigned int DW_5_DATA;
// Version of gfx9 sDMA microcode introducing SDMA_PKT_HDP_FLUSH
static const uint16_t kMinVersion_ = 0x1A5;
} SDMA_PKT_HDP_FLUSH;
static const SDMA_PKT_HDP_FLUSH hdp_flush_cmd = {0x8, 0x0, 0x80000000, 0x0, 0x0, 0x0};
typedef struct SDMA_PKT_GCR_TAG {
union {
struct {
unsigned int op : 8;
unsigned int sub_op : 8;
unsigned int : 16;
};
unsigned int DW_0_DATA;
} HEADER_UNION;
union {
struct {
unsigned int : 7;
unsigned int BaseVA_LO : 25;
};
unsigned int DW_1_DATA;
} WORD1_UNION;
union {
struct {
unsigned int BaseVA_HI : 16;
unsigned int GCR_CONTROL_GLI_INV : 2;
unsigned int GCR_CONTROL_GL1_RANGE : 2;
unsigned int GCR_CONTROL_GLM_WB : 1;
unsigned int GCR_CONTROL_GLM_INV : 1;
unsigned int GCR_CONTROL_GLK_WB : 1;
unsigned int GCR_CONTROL_GLK_INV : 1;
unsigned int GCR_CONTROL_GLV_INV : 1;
unsigned int GCR_CONTROL_GL1_INV : 1;
unsigned int GCR_CONTROL_GL2_US : 1;
unsigned int GCR_CONTROL_GL2_RANGE : 2;
unsigned int GCR_CONTROL_GL2_DISCARD : 1;
unsigned int GCR_CONTROL_GL2_INV : 1;
unsigned int GCR_CONTROL_GL2_WB : 1;
};
unsigned int DW_2_DATA;
} WORD2_UNION;
union {
struct {
unsigned int GCR_CONTROL_RANGE_IS_PA : 1;
unsigned int GCR_CONTROL_SEQ : 2;
unsigned int : 4;
unsigned int LimitVA_LO : 25;
};
unsigned int DW_3_DATA;
} WORD3_UNION;
union {
struct {
unsigned int LimitVA_HI : 16;
unsigned int : 8;
unsigned int VMID : 4;
unsigned int : 4;
};
unsigned int DW_4_DATA;
} WORD4_UNION;
} SDMA_PKT_GCR;
} // namespace amd
} // namespace rocr
#endif // HSA_RUNTIME_CORE_INC_SDMA_REGISTERS_H_

View File

@@ -0,0 +1,207 @@
import os, ctypes, pathlib, re, fcntl, functools, mmap, time
import tinygrad.runtime.autogen.kfd as kfd
from tinygrad.helpers import to_mv, getenv
from extra.hip_gpu_driver import hip_ioctl
import tinygrad.runtime.autogen.hsa as hsa
from hexdump import hexdump
libc = ctypes.CDLL("libc.so.6")
libc.memset.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_int]
libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
libc.mmap.restype = ctypes.c_void_p
MAP_NORESERVE = 0x4000
MAP_FIXED = 0x10
def kfd_ioctl(idir, nr, user_struct, fd, **kwargs):
made = user_struct(**kwargs)
ret = fcntl.ioctl(fd, (idir<<30) | (ctypes.sizeof(user_struct)<<16) | (ord('K')<<8) | nr, made)
if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
return made
def format_struct(s):
sdats = []
for field_name, field_type in s._fields_:
dat = getattr(s, field_name)
if isinstance(dat, int): sdats.append(f"{field_name}:0x{dat:X}")
else: sdats.append(f"{field_name}:{dat}")
return sdats
idirs = {"IOW": 1, "IOR": 2, "IOWR": 3}
def ioctls_from_header():
hdr = pathlib.Path("/usr/include/linux/kfd_ioctl.h").read_text().replace("\\\n", "")
pattern = r'#define\s+(AMDKFD_IOC_[A-Z0-9_]+)\s+AMDKFD_(IOW?R?)\((0x[0-9a-fA-F]+),\s+struct\s([A-Za-z0-9_]+)\)'
matches = re.findall(pattern, hdr, re.MULTILINE)
fxns = {}
for name, idir, nr, sname in matches:
fxns[name.replace("AMDKFD_IOC_", "").lower()] = functools.partial(kfd_ioctl, idirs[idir], int(nr, 0x10), getattr(kfd, "struct_"+sname))
return type("KIO", (object, ), fxns)
kio = ioctls_from_header()
# sudo su -c "echo 'file drivers/gpu/drm/amd/* +p' > /sys/kernel/debug/dynamic_debug/control"
def gpu_alloc_userptr(fd, size, flags):
addr = libc.mmap(0, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
assert addr != 0xffffffffffffffff
mem = kio.alloc_memory_of_gpu(fd, va_addr=addr, size=size, gpu_id=GPU_ID, flags=flags, mmap_offset=addr)
return mem
def gpu_alloc(fd, size, flags):
addr = libc.mmap(0, size, 0, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS|MAP_NORESERVE, -1, 0)
assert addr != 0xffffffffffffffff
mem = kio.alloc_memory_of_gpu(fd, va_addr=addr, size=size, gpu_id=GPU_ID, flags=flags)
buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|MAP_FIXED, drm_fd, mem.mmap_offset)
assert buf != 0xffffffffffffffff
assert addr == buf == mem.va_addr
return mem
if __name__ == "__main__":
fd = os.open("/dev/kfd", os.O_RDWR)
gpu_num = getenv("GPU", 0)
drm_fd = os.open(f"/dev/dri/renderD{128+gpu_num}", os.O_RDWR)
with open(f"/sys/devices/virtual/kfd/kfd/topology/nodes/{1+gpu_num}/gpu_id", "r") as f: GPU_ID = int(f.read())
#ver = kio.get_version(fd)
st = kio.acquire_vm(fd, drm_fd=drm_fd, gpu_id=GPU_ID)
#exit(0)
# 0xF0000001 = KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC | KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
# 0xD6000002 = KFD_IOC_ALLOC_MEM_FLAGS_GTT | KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
# 0xD6000004 = KFD_IOC_ALLOC_MEM_FLAGS_USERPTR | KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
# 0x94000010 = KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP | KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
#addr = libc.mmap(0, 0x1000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS, -1, 0)
#addr = libc.mmap(0, 0x1000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
#mem = kio.AMDKFD_IOC_ALLOC_MEMORY_OF_GPU(fd, va_addr=addr, size=0x1000, gpu_id=GPU_ID, flags=0xD6000004)
#mem = gpu_alloc(fd, 0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM |
# kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE |
# kfd.KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE)
#arr = (ctypes.c_int32 * 1)(GPU_ID)
#stm = kio.map_memory_to_gpu(fd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(arr), n_devices=1)
arr = (ctypes.c_int32 * 1)(GPU_ID)
rw_ptr = gpu_alloc(fd, 0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED |
kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE)
stm = kio.map_memory_to_gpu(fd, handle=rw_ptr.handle, device_ids_array_ptr=ctypes.addressof(arr), n_devices=1)
assert stm.n_success == 1
event_page = gpu_alloc(fd, 0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED |
kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE)
stm = kio.map_memory_to_gpu(fd, handle=event_page.handle, device_ids_array_ptr=ctypes.addressof(arr), n_devices=1)
assert stm.n_success == 1
ring_base = gpu_alloc_userptr(fd, 0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR | kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED |
kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE)
stm = kio.map_memory_to_gpu(fd, handle=ring_base.handle, device_ids_array_ptr=ctypes.addressof(arr), n_devices=1)
assert stm.n_success == 1
signals = gpu_alloc_userptr(fd, 0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR | kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED |
kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE)
stm = kio.map_memory_to_gpu(fd, handle=signals.handle, device_ids_array_ptr=ctypes.addressof(arr), n_devices=1)
assert stm.n_success == 1
eop_buffer = gpu_alloc(fd, 0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM |
kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE |
kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE)
stm = kio.map_memory_to_gpu(fd, handle=eop_buffer.handle, device_ids_array_ptr=ctypes.addressof(arr), n_devices=1)
assert stm.n_success == 1
ctx_save_restore_address = gpu_alloc(fd, 0x2C02000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM |
kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE |
kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE)
stm = kio.map_memory_to_gpu(fd, handle=ctx_save_restore_address.handle, device_ids_array_ptr=ctypes.addressof(arr), n_devices=1)
assert stm.n_success == 1
#113.00 ms + 0.00 ms : 0 = AMDKFD_IOC_CREATE_QUEUE ring_base_address:0x797465200000 write_pointer_address:0x79751C068038 read_pointer_address:0x79751C068080 doorbell_offset:0x0 ring_size:0x800000 gpu_id:0x433D queue_type:0x2 queue_per
#centage:0x64 queue_priority:0x7 queue_id:0x0 eop_buffer_address:0x79751C064000 eop_buffer_size:0x1000 ctx_save_restore_address:0x796E52400000 ctx_save_restore_size:0x2BEA000 ctl_stack_size:0xA000
#113.84 ms + 0.59 ms : 0 = AMDKFD_IOC_CREATE_QUEUE ring_base_address:0x71AC3F600000 write_pointer_address:0x71B302AB0038 read_pointer_address:0x71B302AB0080 doorbell_offset:0xD0CF400000000008 ring_size:0x800000 gpu_id:0x433D queue_typ
#e:0x2 queue_percentage:0x64 queue_priority:0x7 queue_id:0x1 eop_buffer_address:0x71B302AAC000 eop_buffer_size:0x1000 ctx_save_restore_address:0x71AC3C800000 ctx_save_restore_size:0x2BEA000 ctl_stack_size:0xA000
#define KFD_MMAP_TYPE_SHIFT 62
#define KFD_MMAP_TYPE_DOORBELL (0x3ULL << KFD_MMAP_TYPE_SHIFT)
evt = kio.create_event(fd, event_page_offset=event_page.handle, auto_reset=1)
nq = kio.create_queue(fd, ring_base_address=ring_base.va_addr, ring_size=0x1000, gpu_id=GPU_ID,
queue_type=kfd.KFD_IOC_QUEUE_TYPE_COMPUTE_AQL, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE,
queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
eop_buffer_address=eop_buffer.va_addr, eop_buffer_size=0x1000,
ctx_save_restore_address=ctx_save_restore_address.va_addr, ctx_save_restore_size=0x2C02000,
ctl_stack_size = 0xa000,
# write_pointer_address and read_pointer_address are on GART
#write_pointer_address=0xaaaabbbb, read_pointer_address=0xaaaacccc)
write_pointer_address=rw_ptr.va_addr+0, read_pointer_address=rw_ptr.va_addr+0x8)
doorbell = libc.mmap(0, 8192, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, fd, nq.doorbell_offset)
print("doorbell", hex(doorbell))
to_mv(signals.va_addr, 0x40)
"""
hexdump(to_mv(event_page.va_addr, 0x40))
kio.set_event(fd, event_id=evt.event_id)
hexdump(to_mv(event_page.va_addr, 0x40))
kio.reset_event(fd, event_id=evt.event_id)
hexdump(to_mv(event_page.va_addr, 0x40))
"""
# KFD_EVENT_TYPE_SIGNAL
BARRIER_HEADER = 1 << hsa.HSA_PACKET_HEADER_BARRIER
BARRIER_HEADER |= hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE
BARRIER_HEADER |= hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE
BARRIER_HEADER |= hsa.HSA_PACKET_TYPE_BARRIER_AND << hsa.HSA_PACKET_HEADER_TYPE
AQL_PACKET_SIZE = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t)
EMPTY_SIGNAL = hsa.hsa_signal_t()
ds = to_mv(rw_ptr.va_addr, 0x100).cast("Q")
ds[0] = 1 #ring_base.va_addr + AQL_PACKET_SIZE
ds[1] = 0 #ring_base.va_addr
#libc.memset(rw_ptr.va_addr, 0xaa, 0x100)
#hexdump(to_mv(rw_ptr.va_addr, 0x100))
#packet = hsa.hsa_barrier_and_packet_t.from_address(rw_ptr.va_addr+0x38)
packet = hsa.hsa_barrier_and_packet_t.from_address(ring_base.va_addr)
packet.reserved0 = 0
packet.reserved1 = 0
for i in range(5): packet.dep_signal[i] = EMPTY_SIGNAL
#packet.dep_signal[0] = hsa.hsa_signal_t(evt.event_id)
packet.reserved2 = 0
#packet.completion_signal = EMPTY_SIGNAL
packet.completion_signal = hsa.hsa_signal_t(signals.va_addr)
packet.header = BARRIER_HEADER
hexdump(to_mv(ring_base.va_addr, AQL_PACKET_SIZE))
# _HsaEventData
to_mv(signals.va_addr, 0x40).cast("Q")[0] = 1
to_mv(signals.va_addr, 0x40).cast("Q")[1] = 1
#to_mv(signals.va_addr, 0x40).cast("Q")[2] = event_page
to_mv(signals.va_addr, 0x40).cast("Q")[2] = event_page.va_addr + evt.event_slot_index*8 # HWData2=HWAddress
to_mv(signals.va_addr, 0x40).cast("Q")[3] = evt.event_trigger_data # HWData3=HWData
print(hex(ds[0]), hex(ds[1]), hex(ds[2]))
hexdump(to_mv(signals.va_addr, 0x40))
# 10 08 49 3E 46 77 00 00
# ring doorbell
print(hex(to_mv(doorbell, 0x10).cast("I")[0]))
#to_mv(doorbell, 0x10).cast("I")[0] = 0xffffffff
to_mv(doorbell, 0x10).cast("I")[0] = 0
evt_arr = (kfd.struct_kfd_event_data * 1)()
evt_arr[0].event_id = evt.event_id
kio.wait_events(fd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=0, timeout=1000)
print(hex(ds[0]), hex(ds[1]), hex(ds[2]))
hexdump(to_mv(signals.va_addr, 0x40))
#nq = kio.create_queue(fd, ring_base_address=buf, ring_size=0x1000, gpu_id=GPU_ID,
# queue_type=kfd.KFD_IOC_QUEUE_TYPE_COMPUTE_AQL, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE,
# queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY, write_pointer_address=buf+8, read_pointer_address=buf+0x10)
#print(nq)
#mv = to_mv(buf, 0x1000)
#addr = libc.mmap(0, 0x1000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS, -1, 0)
#print('\n'.join(format_struct(ver)))
#print('\n'.join(format_struct(st)))

File diff suppressed because it is too large Load Diff

View File

@@ -3588,6 +3588,128 @@ try:
hsa_amd_vmem_get_alloc_properties_from_handle.argtypes = [hsa_amd_vmem_alloc_handle_t, ctypes.POINTER(struct_hsa_amd_memory_pool_s), ctypes.POINTER(c__EA_hsa_amd_memory_type_t)]
except AttributeError:
pass
amd_queue_properties32_t = ctypes.c_uint32
# values for enumeration 'amd_queue_properties_t'
amd_queue_properties_t__enumvalues = {
0: 'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_SHIFT',
1: 'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_WIDTH',
1: 'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER',
1: 'AMD_QUEUE_PROPERTIES_IS_PTR64_SHIFT',
1: 'AMD_QUEUE_PROPERTIES_IS_PTR64_WIDTH',
2: 'AMD_QUEUE_PROPERTIES_IS_PTR64',
2: 'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS_SHIFT',
1: 'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS_WIDTH',
4: 'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS',
3: 'AMD_QUEUE_PROPERTIES_ENABLE_PROFILING_SHIFT',
1: 'AMD_QUEUE_PROPERTIES_ENABLE_PROFILING_WIDTH',
8: 'AMD_QUEUE_PROPERTIES_ENABLE_PROFILING',
4: 'AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE_SHIFT',
1: 'AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE_WIDTH',
16: 'AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE',
5: 'AMD_QUEUE_PROPERTIES_RESERVED1_SHIFT',
27: 'AMD_QUEUE_PROPERTIES_RESERVED1_WIDTH',
-32: 'AMD_QUEUE_PROPERTIES_RESERVED1',
}
AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_SHIFT = 0
AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_WIDTH = 1
AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER = 1
AMD_QUEUE_PROPERTIES_IS_PTR64_SHIFT = 1
AMD_QUEUE_PROPERTIES_IS_PTR64_WIDTH = 1
AMD_QUEUE_PROPERTIES_IS_PTR64 = 2
AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS_SHIFT = 2
AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS_WIDTH = 1
AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS = 4
AMD_QUEUE_PROPERTIES_ENABLE_PROFILING_SHIFT = 3
AMD_QUEUE_PROPERTIES_ENABLE_PROFILING_WIDTH = 1
AMD_QUEUE_PROPERTIES_ENABLE_PROFILING = 8
AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE_SHIFT = 4
AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE_WIDTH = 1
AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE = 16
AMD_QUEUE_PROPERTIES_RESERVED1_SHIFT = 5
AMD_QUEUE_PROPERTIES_RESERVED1_WIDTH = 27
AMD_QUEUE_PROPERTIES_RESERVED1 = -32
amd_queue_properties_t = ctypes.c_int32 # enum
class struct_amd_queue_s(Structure):
pass
struct_amd_queue_s._pack_ = 1 # source:False
struct_amd_queue_s._fields_ = [
('hsa_queue', hsa_queue_t),
('reserved1', ctypes.c_uint32 * 4),
('write_dispatch_id', ctypes.c_uint64),
('group_segment_aperture_base_hi', ctypes.c_uint32),
('private_segment_aperture_base_hi', ctypes.c_uint32),
('max_cu_id', ctypes.c_uint32),
('max_wave_id', ctypes.c_uint32),
('max_legacy_doorbell_dispatch_id_plus_1', ctypes.c_uint64),
('legacy_doorbell_lock', ctypes.c_uint32),
('reserved2', ctypes.c_uint32 * 9),
('read_dispatch_id', ctypes.c_uint64),
('read_dispatch_id_field_base_byte_offset', ctypes.c_uint32),
('compute_tmpring_size', ctypes.c_uint32),
('scratch_resource_descriptor', ctypes.c_uint32 * 4),
('scratch_backing_memory_location', ctypes.c_uint64),
('scratch_backing_memory_byte_size', ctypes.c_uint64),
('scratch_wave64_lane_byte_size', ctypes.c_uint32),
('queue_properties', ctypes.c_uint32),
('reserved3', ctypes.c_uint32 * 2),
('queue_inactive_signal', hsa_signal_t),
('reserved4', ctypes.c_uint32 * 14),
]
amd_queue_t = struct_amd_queue_s
amd_signal_kind64_t = ctypes.c_int64
# values for enumeration 'amd_signal_kind_t'
amd_signal_kind_t__enumvalues = {
0: 'AMD_SIGNAL_KIND_INVALID',
1: 'AMD_SIGNAL_KIND_USER',
-1: 'AMD_SIGNAL_KIND_DOORBELL',
-2: 'AMD_SIGNAL_KIND_LEGACY_DOORBELL',
}
AMD_SIGNAL_KIND_INVALID = 0
AMD_SIGNAL_KIND_USER = 1
AMD_SIGNAL_KIND_DOORBELL = -1
AMD_SIGNAL_KIND_LEGACY_DOORBELL = -2
amd_signal_kind_t = ctypes.c_int32 # enum
class struct_amd_signal_s(Structure):
pass
class union_amd_signal_s_0(Union):
pass
union_amd_signal_s_0._pack_ = 1 # source:False
union_amd_signal_s_0._fields_ = [
('value', ctypes.c_int64),
('legacy_hardware_doorbell_ptr', ctypes.POINTER(ctypes.c_uint32)),
('hardware_doorbell_ptr', ctypes.POINTER(ctypes.c_uint64)),
]
class union_amd_signal_s_1(Union):
pass
union_amd_signal_s_1._pack_ = 1 # source:False
union_amd_signal_s_1._fields_ = [
('queue_ptr', ctypes.POINTER(struct_amd_queue_s)),
('reserved2', ctypes.c_uint64),
]
struct_amd_signal_s._pack_ = 1 # source:False
struct_amd_signal_s._anonymous_ = ('_0', '_1',)
struct_amd_signal_s._fields_ = [
('kind', ctypes.c_int64),
('_0', union_amd_signal_s_0),
('event_mailbox_ptr', ctypes.c_uint64),
('event_id', ctypes.c_uint32),
('reserved1', ctypes.c_uint32),
('start_ts', ctypes.c_uint64),
('end_ts', ctypes.c_uint64),
('_1', union_amd_signal_s_1),
('reserved3', ctypes.c_uint32 * 2),
]
amd_signal_t = struct_amd_signal_s
class struct_BrigModuleHeader(Structure):
pass
@@ -3713,7 +3835,27 @@ struct_hsa_ext_finalizer_1_00_pfn_s._fields_ = [
hsa_ext_finalizer_1_00_pfn_t = struct_hsa_ext_finalizer_1_00_pfn_s
__all__ = \
['BrigModule_t', 'HSA_ACCESS_PERMISSION_NONE',
['AMD_QUEUE_PROPERTIES_ENABLE_PROFILING',
'AMD_QUEUE_PROPERTIES_ENABLE_PROFILING_SHIFT',
'AMD_QUEUE_PROPERTIES_ENABLE_PROFILING_WIDTH',
'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER',
'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS',
'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS_SHIFT',
'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS_WIDTH',
'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_SHIFT',
'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_WIDTH',
'AMD_QUEUE_PROPERTIES_IS_PTR64',
'AMD_QUEUE_PROPERTIES_IS_PTR64_SHIFT',
'AMD_QUEUE_PROPERTIES_IS_PTR64_WIDTH',
'AMD_QUEUE_PROPERTIES_RESERVED1',
'AMD_QUEUE_PROPERTIES_RESERVED1_SHIFT',
'AMD_QUEUE_PROPERTIES_RESERVED1_WIDTH',
'AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE',
'AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE_SHIFT',
'AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE_WIDTH',
'AMD_SIGNAL_KIND_DOORBELL', 'AMD_SIGNAL_KIND_INVALID',
'AMD_SIGNAL_KIND_LEGACY_DOORBELL', 'AMD_SIGNAL_KIND_USER',
'BrigModule_t', 'HSA_ACCESS_PERMISSION_NONE',
'HSA_ACCESS_PERMISSION_RO', 'HSA_ACCESS_PERMISSION_RW',
'HSA_ACCESS_PERMISSION_WO', 'HSA_AGENT_FEATURE_AGENT_DISPATCH',
'HSA_AGENT_FEATURE_KERNEL_DISPATCH',
@@ -4082,8 +4224,10 @@ __all__ = \
'HSA_VARIABLE_SEGMENT_READONLY', 'HSA_WAIT_STATE_ACTIVE',
'HSA_WAIT_STATE_BLOCKED', 'HSA_WAVEFRONT_INFO_SIZE',
'MEMORY_TYPE_NONE', 'MEMORY_TYPE_PINNED',
'c__EA_hsa_access_permission_t', 'c__EA_hsa_agent_feature_t',
'c__EA_hsa_agent_info_t',
'amd_queue_properties32_t', 'amd_queue_properties_t',
'amd_queue_t', 'amd_signal_kind64_t', 'amd_signal_kind_t',
'amd_signal_t', 'c__EA_hsa_access_permission_t',
'c__EA_hsa_agent_feature_t', 'c__EA_hsa_agent_info_t',
'c__EA_hsa_amd_agent_memory_pool_info_t',
'c__EA_hsa_amd_copy_direction_t',
'c__EA_hsa_amd_hw_exception_reset_cause_t',
@@ -4422,6 +4566,7 @@ __all__ = \
'hsa_wait_state_t__enumvalues', 'hsa_wavefront_get_info',
'hsa_wavefront_info_t', 'hsa_wavefront_info_t__enumvalues',
'hsa_wavefront_t', 'int32_t', 'size_t', 'struct_BrigModuleHeader',
'struct_amd_queue_s', 'struct_amd_signal_s',
'struct_hsa_agent_dispatch_packet_s', 'struct_hsa_agent_s',
'struct_hsa_amd_barrier_value_packet_s', 'struct_hsa_amd_event_s',
'struct_hsa_amd_gpu_hw_exception_info_s',
@@ -4455,4 +4600,5 @@ __all__ = \
'struct_hsa_queue_s', 'struct_hsa_region_s',
'struct_hsa_signal_group_s', 'struct_hsa_signal_s',
'struct_hsa_wavefront_s', 'uint16_t', 'uint32_t', 'uint64_t',
'union_amd_signal_s_0', 'union_amd_signal_s_1',
'union_hsa_amd_event_s_0']

View File

@@ -0,0 +1,812 @@
# mypy: ignore-errors
# -*- coding: utf-8 -*-
#
# TARGET arch is: []
# WORD_SIZE is: 8
# POINTER_SIZE is: 8
# LONGDOUBLE_SIZE is: 16
#
import ctypes, os
class AsDictMixin:
@classmethod
def as_dict(cls, self):
result = {}
if not isinstance(self, AsDictMixin):
# not a structure, assume it's already a python object
return self
if not hasattr(cls, "_fields_"):
return result
# sys.version_info >= (3, 5)
# for (field, *_) in cls._fields_: # noqa
for field_tuple in cls._fields_: # noqa
field = field_tuple[0]
if field.startswith('PADDING_'):
continue
value = getattr(self, field)
type_ = type(value)
if hasattr(value, "_length_") and hasattr(value, "_type_"):
# array
if not hasattr(type_, "as_dict"):
value = [v for v in value]
else:
type_ = type_._type_
value = [type_.as_dict(v) for v in value]
elif hasattr(value, "contents") and hasattr(value, "_type_"):
# pointer
try:
if not hasattr(type_, "as_dict"):
value = value.contents
else:
type_ = type_._type_
value = type_.as_dict(value.contents)
except ValueError:
# nullptr
value = None
elif isinstance(value, AsDictMixin):
# other structure
value = type_.as_dict(value)
result[field] = value
return result
class Structure(ctypes.Structure, AsDictMixin):
def __init__(self, *args, **kwds):
# We don't want to use positional arguments fill PADDING_* fields
args = dict(zip(self.__class__._field_names_(), args))
args.update(kwds)
super(Structure, self).__init__(**args)
@classmethod
def _field_names_(cls):
if hasattr(cls, '_fields_'):
return (f[0] for f in cls._fields_ if not f[0].startswith('PADDING'))
else:
return ()
@classmethod
def get_type(cls, field):
for f in cls._fields_:
if f[0] == field:
return f[1]
return None
@classmethod
def bind(cls, bound_fields):
fields = {}
for name, type_ in cls._fields_:
if hasattr(type_, "restype"):
if name in bound_fields:
if bound_fields[name] is None:
fields[name] = type_()
else:
# use a closure to capture the callback from the loop scope
fields[name] = (
type_((lambda callback: lambda *args: callback(*args))(
bound_fields[name]))
)
del bound_fields[name]
else:
# default callback implementation (does nothing)
try:
default_ = type_(0).restype().value
except TypeError:
default_ = None
fields[name] = type_((
lambda default_: lambda *args: default_)(default_))
else:
# not a callback function, use default initialization
if name in bound_fields:
fields[name] = bound_fields[name]
del bound_fields[name]
else:
fields[name] = type_()
if len(bound_fields) != 0:
raise ValueError(
"Cannot bind the following unknown callback(s) {}.{}".format(
cls.__name__, bound_fields.keys()
))
return cls(**fields)
class Union(ctypes.Union, AsDictMixin):
pass
KFD_IOCTL_H_INCLUDED = True # macro
KFD_IOCTL_MAJOR_VERSION = 1 # macro
KFD_IOCTL_MINOR_VERSION = 6 # macro
KFD_IOC_QUEUE_TYPE_COMPUTE = 0x0 # macro
KFD_IOC_QUEUE_TYPE_SDMA = 0x1 # macro
KFD_IOC_QUEUE_TYPE_COMPUTE_AQL = 0x2 # macro
KFD_IOC_QUEUE_TYPE_SDMA_XGMI = 0x3 # macro
KFD_MAX_QUEUE_PERCENTAGE = 100 # macro
KFD_MAX_QUEUE_PRIORITY = 15 # macro
KFD_IOC_CACHE_POLICY_COHERENT = 0 # macro
KFD_IOC_CACHE_POLICY_NONCOHERENT = 1 # macro
NUM_OF_SUPPORTED_GPUS = 7 # macro
MAX_ALLOWED_NUM_POINTS = 100 # macro
MAX_ALLOWED_AW_BUFF_SIZE = 4096 # macro
MAX_ALLOWED_WAC_BUFF_SIZE = 128 # macro
KFD_IOC_EVENT_SIGNAL = 0 # macro
KFD_IOC_EVENT_NODECHANGE = 1 # macro
KFD_IOC_EVENT_DEVICESTATECHANGE = 2 # macro
KFD_IOC_EVENT_HW_EXCEPTION = 3 # macro
KFD_IOC_EVENT_SYSTEM_EVENT = 4 # macro
KFD_IOC_EVENT_DEBUG_EVENT = 5 # macro
KFD_IOC_EVENT_PROFILE_EVENT = 6 # macro
KFD_IOC_EVENT_QUEUE_EVENT = 7 # macro
KFD_IOC_EVENT_MEMORY = 8 # macro
KFD_IOC_WAIT_RESULT_COMPLETE = 0 # macro
KFD_IOC_WAIT_RESULT_TIMEOUT = 1 # macro
KFD_IOC_WAIT_RESULT_FAIL = 2 # macro
KFD_SIGNAL_EVENT_LIMIT = 4096 # macro
KFD_HW_EXCEPTION_WHOLE_GPU_RESET = 0 # macro
KFD_HW_EXCEPTION_PER_ENGINE_RESET = 1 # macro
KFD_HW_EXCEPTION_GPU_HANG = 0 # macro
KFD_HW_EXCEPTION_ECC = 1 # macro
KFD_MEM_ERR_NO_RAS = 0 # macro
KFD_MEM_ERR_SRAM_ECC = 1 # macro
KFD_MEM_ERR_POISON_CONSUMED = 2 # macro
KFD_MEM_ERR_GPU_HANG = 3 # macro
KFD_IOC_ALLOC_MEM_FLAGS_VRAM = (1<<0) # macro
KFD_IOC_ALLOC_MEM_FLAGS_GTT = (1<<1) # macro
KFD_IOC_ALLOC_MEM_FLAGS_USERPTR = (1<<2) # macro
KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL = (1<<3) # macro
KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP = (1<<4) # macro
KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE = (1<<31) # macro
KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE = (1<<30) # macro
KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC = (1<<29) # macro
KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE = (1<<28) # macro
KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM = (1<<27) # macro
KFD_IOC_ALLOC_MEM_FLAGS_COHERENT = (1<<26) # macro
KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED = (1<<25) # macro
# def KFD_SMI_EVENT_MASK_FROM_INDEX(i): # macro
# return (1<<((i)-1))
KFD_IOCTL_SVM_FLAG_HOST_ACCESS = 0x00000001 # macro
KFD_IOCTL_SVM_FLAG_COHERENT = 0x00000002 # macro
KFD_IOCTL_SVM_FLAG_HIVE_LOCAL = 0x00000004 # macro
KFD_IOCTL_SVM_FLAG_GPU_RO = 0x00000008 # macro
KFD_IOCTL_SVM_FLAG_GPU_EXEC = 0x00000010 # macro
KFD_IOCTL_SVM_FLAG_GPU_READ_MOSTLY = 0x00000020 # macro
AMDKFD_IOCTL_BASE = 'K' # macro
# def AMDKFD_IO(nr): # macro
# return _IO('K',nr)
# def AMDKFD_IOR(nr, type): # macro
# return _IOR('K',nr,type)
# def AMDKFD_IOW(nr, type): # macro
# return _IOW('K',nr,type)
# def AMDKFD_IOWR(nr, type): # macro
# return _IOWR('K',nr,type)
# AMDKFD_IOC_GET_VERSION = _IOR('K',nr,type) ( 0x01 , struct kfd_ioctl_get_version_args ) # macro
# AMDKFD_IOC_CREATE_QUEUE = _IOWR('K',nr,type) ( 0x02 , struct kfd_ioctl_create_queue_args ) # macro
# AMDKFD_IOC_DESTROY_QUEUE = _IOWR('K',nr,type) ( 0x03 , struct kfd_ioctl_destroy_queue_args ) # macro
# AMDKFD_IOC_SET_MEMORY_POLICY = _IOW('K',nr,type) ( 0x04 , struct kfd_ioctl_set_memory_policy_args ) # macro
# AMDKFD_IOC_GET_CLOCK_COUNTERS = _IOWR('K',nr,type) ( 0x05 , struct kfd_ioctl_get_clock_counters_args ) # macro
# AMDKFD_IOC_GET_PROCESS_APERTURES = _IOR('K',nr,type) ( 0x06 , struct kfd_ioctl_get_process_apertures_args ) # macro
# AMDKFD_IOC_UPDATE_QUEUE = _IOW('K',nr,type) ( 0x07 , struct kfd_ioctl_update_queue_args ) # macro
# AMDKFD_IOC_CREATE_EVENT = _IOWR('K',nr,type) ( 0x08 , struct kfd_ioctl_create_event_args ) # macro
# AMDKFD_IOC_DESTROY_EVENT = _IOW('K',nr,type) ( 0x09 , struct kfd_ioctl_destroy_event_args ) # macro
# AMDKFD_IOC_SET_EVENT = _IOW('K',nr,type) ( 0x0A , struct kfd_ioctl_set_event_args ) # macro
# AMDKFD_IOC_RESET_EVENT = _IOW('K',nr,type) ( 0x0B , struct kfd_ioctl_reset_event_args ) # macro
# AMDKFD_IOC_WAIT_EVENTS = _IOWR('K',nr,type) ( 0x0C , struct kfd_ioctl_wait_events_args ) # macro
# AMDKFD_IOC_DBG_REGISTER = _IOW('K',nr,type) ( 0x0D , struct kfd_ioctl_dbg_register_args ) # macro
# AMDKFD_IOC_DBG_UNREGISTER = _IOW('K',nr,type) ( 0x0E , struct kfd_ioctl_dbg_unregister_args ) # macro
# AMDKFD_IOC_DBG_ADDRESS_WATCH = _IOW('K',nr,type) ( 0x0F , struct kfd_ioctl_dbg_address_watch_args ) # macro
# AMDKFD_IOC_DBG_WAVE_CONTROL = _IOW('K',nr,type) ( 0x10 , struct kfd_ioctl_dbg_wave_control_args ) # macro
# AMDKFD_IOC_SET_SCRATCH_BACKING_VA = _IOWR('K',nr,type) ( 0x11 , struct kfd_ioctl_set_scratch_backing_va_args ) # macro
# AMDKFD_IOC_GET_TILE_CONFIG = _IOWR('K',nr,type) ( 0x12 , struct kfd_ioctl_get_tile_config_args ) # macro
# AMDKFD_IOC_SET_TRAP_HANDLER = _IOW('K',nr,type) ( 0x13 , struct kfd_ioctl_set_trap_handler_args ) # macro
# AMDKFD_IOC_GET_PROCESS_APERTURES_NEW = _IOWR('K',nr,type) ( 0x14 , struct kfd_ioctl_get_process_apertures_new_args ) # macro
# AMDKFD_IOC_ACQUIRE_VM = _IOW('K',nr,type) ( 0x15 , struct kfd_ioctl_acquire_vm_args ) # macro
# AMDKFD_IOC_ALLOC_MEMORY_OF_GPU = _IOWR('K',nr,type) ( 0x16 , struct kfd_ioctl_alloc_memory_of_gpu_args ) # macro
# AMDKFD_IOC_FREE_MEMORY_OF_GPU = _IOW('K',nr,type) ( 0x17 , struct kfd_ioctl_free_memory_of_gpu_args ) # macro
# AMDKFD_IOC_MAP_MEMORY_TO_GPU = _IOWR('K',nr,type) ( 0x18 , struct kfd_ioctl_map_memory_to_gpu_args ) # macro
# AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU = _IOWR('K',nr,type) ( 0x19 , struct kfd_ioctl_unmap_memory_from_gpu_args ) # macro
# AMDKFD_IOC_SET_CU_MASK = _IOW('K',nr,type) ( 0x1A , struct kfd_ioctl_set_cu_mask_args ) # macro
# AMDKFD_IOC_GET_QUEUE_WAVE_STATE = _IOWR('K',nr,type) ( 0x1B , struct kfd_ioctl_get_queue_wave_state_args ) # macro
# AMDKFD_IOC_GET_DMABUF_INFO = _IOWR('K',nr,type) ( 0x1C , struct kfd_ioctl_get_dmabuf_info_args ) # macro
# AMDKFD_IOC_IMPORT_DMABUF = _IOWR('K',nr,type) ( 0x1D , struct kfd_ioctl_import_dmabuf_args ) # macro
# AMDKFD_IOC_ALLOC_QUEUE_GWS = _IOWR('K',nr,type) ( 0x1E , struct kfd_ioctl_alloc_queue_gws_args ) # macro
# AMDKFD_IOC_SMI_EVENTS = _IOWR('K',nr,type) ( 0x1F , struct kfd_ioctl_smi_events_args ) # macro
# AMDKFD_IOC_SVM = _IOWR('K',nr,type) ( 0x20 , struct kfd_ioctl_svm_args ) # macro
# AMDKFD_IOC_SET_XNACK_MODE = _IOWR('K',nr,type) ( 0x21 , struct kfd_ioctl_set_xnack_mode_args ) # macro
AMDKFD_COMMAND_START = 0x01 # macro
AMDKFD_COMMAND_END = 0x22 # macro
class struct_kfd_ioctl_get_version_args(Structure):
pass
struct_kfd_ioctl_get_version_args._pack_ = 1 # source:False
struct_kfd_ioctl_get_version_args._fields_ = [
('major_version', ctypes.c_uint32),
('minor_version', ctypes.c_uint32),
]
class struct_kfd_ioctl_create_queue_args(Structure):
pass
struct_kfd_ioctl_create_queue_args._pack_ = 1 # source:False
struct_kfd_ioctl_create_queue_args._fields_ = [
('ring_base_address', ctypes.c_uint64),
('write_pointer_address', ctypes.c_uint64),
('read_pointer_address', ctypes.c_uint64),
('doorbell_offset', ctypes.c_uint64),
('ring_size', ctypes.c_uint32),
('gpu_id', ctypes.c_uint32),
('queue_type', ctypes.c_uint32),
('queue_percentage', ctypes.c_uint32),
('queue_priority', ctypes.c_uint32),
('queue_id', ctypes.c_uint32),
('eop_buffer_address', ctypes.c_uint64),
('eop_buffer_size', ctypes.c_uint64),
('ctx_save_restore_address', ctypes.c_uint64),
('ctx_save_restore_size', ctypes.c_uint32),
('ctl_stack_size', ctypes.c_uint32),
]
class struct_kfd_ioctl_destroy_queue_args(Structure):
pass
struct_kfd_ioctl_destroy_queue_args._pack_ = 1 # source:False
struct_kfd_ioctl_destroy_queue_args._fields_ = [
('queue_id', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_ioctl_update_queue_args(Structure):
pass
struct_kfd_ioctl_update_queue_args._pack_ = 1 # source:False
struct_kfd_ioctl_update_queue_args._fields_ = [
('ring_base_address', ctypes.c_uint64),
('queue_id', ctypes.c_uint32),
('ring_size', ctypes.c_uint32),
('queue_percentage', ctypes.c_uint32),
('queue_priority', ctypes.c_uint32),
]
class struct_kfd_ioctl_set_cu_mask_args(Structure):
pass
struct_kfd_ioctl_set_cu_mask_args._pack_ = 1 # source:False
struct_kfd_ioctl_set_cu_mask_args._fields_ = [
('queue_id', ctypes.c_uint32),
('num_cu_mask', ctypes.c_uint32),
('cu_mask_ptr', ctypes.c_uint64),
]
class struct_kfd_ioctl_get_queue_wave_state_args(Structure):
pass
struct_kfd_ioctl_get_queue_wave_state_args._pack_ = 1 # source:False
struct_kfd_ioctl_get_queue_wave_state_args._fields_ = [
('ctl_stack_address', ctypes.c_uint64),
('ctl_stack_used_size', ctypes.c_uint32),
('save_area_used_size', ctypes.c_uint32),
('queue_id', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_ioctl_set_memory_policy_args(Structure):
pass
struct_kfd_ioctl_set_memory_policy_args._pack_ = 1 # source:False
struct_kfd_ioctl_set_memory_policy_args._fields_ = [
('alternate_aperture_base', ctypes.c_uint64),
('alternate_aperture_size', ctypes.c_uint64),
('gpu_id', ctypes.c_uint32),
('default_policy', ctypes.c_uint32),
('alternate_policy', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_ioctl_get_clock_counters_args(Structure):
pass
struct_kfd_ioctl_get_clock_counters_args._pack_ = 1 # source:False
struct_kfd_ioctl_get_clock_counters_args._fields_ = [
('gpu_clock_counter', ctypes.c_uint64),
('cpu_clock_counter', ctypes.c_uint64),
('system_clock_counter', ctypes.c_uint64),
('system_clock_freq', ctypes.c_uint64),
('gpu_id', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_process_device_apertures(Structure):
pass
struct_kfd_process_device_apertures._pack_ = 1 # source:False
struct_kfd_process_device_apertures._fields_ = [
('lds_base', ctypes.c_uint64),
('lds_limit', ctypes.c_uint64),
('scratch_base', ctypes.c_uint64),
('scratch_limit', ctypes.c_uint64),
('gpuvm_base', ctypes.c_uint64),
('gpuvm_limit', ctypes.c_uint64),
('gpu_id', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_ioctl_get_process_apertures_args(Structure):
pass
struct_kfd_ioctl_get_process_apertures_args._pack_ = 1 # source:False
struct_kfd_ioctl_get_process_apertures_args._fields_ = [
('process_apertures', struct_kfd_process_device_apertures * 7),
('num_of_nodes', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_ioctl_get_process_apertures_new_args(Structure):
pass
struct_kfd_ioctl_get_process_apertures_new_args._pack_ = 1 # source:False
struct_kfd_ioctl_get_process_apertures_new_args._fields_ = [
('kfd_process_device_apertures_ptr', ctypes.c_uint64),
('num_of_nodes', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_ioctl_dbg_register_args(Structure):
pass
struct_kfd_ioctl_dbg_register_args._pack_ = 1 # source:False
struct_kfd_ioctl_dbg_register_args._fields_ = [
('gpu_id', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_ioctl_dbg_unregister_args(Structure):
pass
struct_kfd_ioctl_dbg_unregister_args._pack_ = 1 # source:False
struct_kfd_ioctl_dbg_unregister_args._fields_ = [
('gpu_id', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_ioctl_dbg_address_watch_args(Structure):
pass
struct_kfd_ioctl_dbg_address_watch_args._pack_ = 1 # source:False
struct_kfd_ioctl_dbg_address_watch_args._fields_ = [
('content_ptr', ctypes.c_uint64),
('gpu_id', ctypes.c_uint32),
('buf_size_in_bytes', ctypes.c_uint32),
]
class struct_kfd_ioctl_dbg_wave_control_args(Structure):
pass
struct_kfd_ioctl_dbg_wave_control_args._pack_ = 1 # source:False
struct_kfd_ioctl_dbg_wave_control_args._fields_ = [
('content_ptr', ctypes.c_uint64),
('gpu_id', ctypes.c_uint32),
('buf_size_in_bytes', ctypes.c_uint32),
]
class struct_kfd_ioctl_create_event_args(Structure):
pass
struct_kfd_ioctl_create_event_args._pack_ = 1 # source:False
struct_kfd_ioctl_create_event_args._fields_ = [
('event_page_offset', ctypes.c_uint64),
('event_trigger_data', ctypes.c_uint32),
('event_type', ctypes.c_uint32),
('auto_reset', ctypes.c_uint32),
('node_id', ctypes.c_uint32),
('event_id', ctypes.c_uint32),
('event_slot_index', ctypes.c_uint32),
]
class struct_kfd_ioctl_destroy_event_args(Structure):
pass
struct_kfd_ioctl_destroy_event_args._pack_ = 1 # source:False
struct_kfd_ioctl_destroy_event_args._fields_ = [
('event_id', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_ioctl_set_event_args(Structure):
pass
struct_kfd_ioctl_set_event_args._pack_ = 1 # source:False
struct_kfd_ioctl_set_event_args._fields_ = [
('event_id', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_ioctl_reset_event_args(Structure):
pass
struct_kfd_ioctl_reset_event_args._pack_ = 1 # source:False
struct_kfd_ioctl_reset_event_args._fields_ = [
('event_id', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_memory_exception_failure(Structure):
pass
struct_kfd_memory_exception_failure._pack_ = 1 # source:False
struct_kfd_memory_exception_failure._fields_ = [
('NotPresent', ctypes.c_uint32),
('ReadOnly', ctypes.c_uint32),
('NoExecute', ctypes.c_uint32),
('imprecise', ctypes.c_uint32),
]
class struct_kfd_hsa_memory_exception_data(Structure):
pass
struct_kfd_hsa_memory_exception_data._pack_ = 1 # source:False
struct_kfd_hsa_memory_exception_data._fields_ = [
('failure', struct_kfd_memory_exception_failure),
('va', ctypes.c_uint64),
('gpu_id', ctypes.c_uint32),
('ErrorType', ctypes.c_uint32),
]
class struct_kfd_hsa_hw_exception_data(Structure):
pass
struct_kfd_hsa_hw_exception_data._pack_ = 1 # source:False
struct_kfd_hsa_hw_exception_data._fields_ = [
('reset_type', ctypes.c_uint32),
('reset_cause', ctypes.c_uint32),
('memory_lost', ctypes.c_uint32),
('gpu_id', ctypes.c_uint32),
]
class struct_kfd_event_data(Structure):
pass
class union_kfd_event_data_0(Union):
pass
union_kfd_event_data_0._pack_ = 1 # source:False
union_kfd_event_data_0._fields_ = [
('memory_exception_data', struct_kfd_hsa_memory_exception_data),
('hw_exception_data', struct_kfd_hsa_hw_exception_data),
('PADDING_0', ctypes.c_ubyte * 16),
]
struct_kfd_event_data._pack_ = 1 # source:False
struct_kfd_event_data._anonymous_ = ('_0',)
struct_kfd_event_data._fields_ = [
('_0', union_kfd_event_data_0),
('kfd_event_data_ext', ctypes.c_uint64),
('event_id', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_ioctl_wait_events_args(Structure):
pass
struct_kfd_ioctl_wait_events_args._pack_ = 1 # source:False
struct_kfd_ioctl_wait_events_args._fields_ = [
('events_ptr', ctypes.c_uint64),
('num_events', ctypes.c_uint32),
('wait_for_all', ctypes.c_uint32),
('timeout', ctypes.c_uint32),
('wait_result', ctypes.c_uint32),
]
class struct_kfd_ioctl_set_scratch_backing_va_args(Structure):
pass
struct_kfd_ioctl_set_scratch_backing_va_args._pack_ = 1 # source:False
struct_kfd_ioctl_set_scratch_backing_va_args._fields_ = [
('va_addr', ctypes.c_uint64),
('gpu_id', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_ioctl_get_tile_config_args(Structure):
pass
struct_kfd_ioctl_get_tile_config_args._pack_ = 1 # source:False
struct_kfd_ioctl_get_tile_config_args._fields_ = [
('tile_config_ptr', ctypes.c_uint64),
('macro_tile_config_ptr', ctypes.c_uint64),
('num_tile_configs', ctypes.c_uint32),
('num_macro_tile_configs', ctypes.c_uint32),
('gpu_id', ctypes.c_uint32),
('gb_addr_config', ctypes.c_uint32),
('num_banks', ctypes.c_uint32),
('num_ranks', ctypes.c_uint32),
]
class struct_kfd_ioctl_set_trap_handler_args(Structure):
pass
struct_kfd_ioctl_set_trap_handler_args._pack_ = 1 # source:False
struct_kfd_ioctl_set_trap_handler_args._fields_ = [
('tba_addr', ctypes.c_uint64),
('tma_addr', ctypes.c_uint64),
('gpu_id', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_ioctl_acquire_vm_args(Structure):
pass
struct_kfd_ioctl_acquire_vm_args._pack_ = 1 # source:False
struct_kfd_ioctl_acquire_vm_args._fields_ = [
('drm_fd', ctypes.c_uint32),
('gpu_id', ctypes.c_uint32),
]
class struct_kfd_ioctl_alloc_memory_of_gpu_args(Structure):
pass
struct_kfd_ioctl_alloc_memory_of_gpu_args._pack_ = 1 # source:False
struct_kfd_ioctl_alloc_memory_of_gpu_args._fields_ = [
('va_addr', ctypes.c_uint64),
('size', ctypes.c_uint64),
('handle', ctypes.c_uint64),
('mmap_offset', ctypes.c_uint64),
('gpu_id', ctypes.c_uint32),
('flags', ctypes.c_uint32),
]
class struct_kfd_ioctl_free_memory_of_gpu_args(Structure):
pass
struct_kfd_ioctl_free_memory_of_gpu_args._pack_ = 1 # source:False
struct_kfd_ioctl_free_memory_of_gpu_args._fields_ = [
('handle', ctypes.c_uint64),
]
class struct_kfd_ioctl_map_memory_to_gpu_args(Structure):
pass
struct_kfd_ioctl_map_memory_to_gpu_args._pack_ = 1 # source:False
struct_kfd_ioctl_map_memory_to_gpu_args._fields_ = [
('handle', ctypes.c_uint64),
('device_ids_array_ptr', ctypes.c_uint64),
('n_devices', ctypes.c_uint32),
('n_success', ctypes.c_uint32),
]
class struct_kfd_ioctl_unmap_memory_from_gpu_args(Structure):
pass
struct_kfd_ioctl_unmap_memory_from_gpu_args._pack_ = 1 # source:False
struct_kfd_ioctl_unmap_memory_from_gpu_args._fields_ = [
('handle', ctypes.c_uint64),
('device_ids_array_ptr', ctypes.c_uint64),
('n_devices', ctypes.c_uint32),
('n_success', ctypes.c_uint32),
]
class struct_kfd_ioctl_alloc_queue_gws_args(Structure):
pass
struct_kfd_ioctl_alloc_queue_gws_args._pack_ = 1 # source:False
struct_kfd_ioctl_alloc_queue_gws_args._fields_ = [
('queue_id', ctypes.c_uint32),
('num_gws', ctypes.c_uint32),
('first_gws', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_ioctl_get_dmabuf_info_args(Structure):
pass
struct_kfd_ioctl_get_dmabuf_info_args._pack_ = 1 # source:False
struct_kfd_ioctl_get_dmabuf_info_args._fields_ = [
('size', ctypes.c_uint64),
('metadata_ptr', ctypes.c_uint64),
('metadata_size', ctypes.c_uint32),
('gpu_id', ctypes.c_uint32),
('flags', ctypes.c_uint32),
('dmabuf_fd', ctypes.c_uint32),
]
class struct_kfd_ioctl_import_dmabuf_args(Structure):
pass
struct_kfd_ioctl_import_dmabuf_args._pack_ = 1 # source:False
struct_kfd_ioctl_import_dmabuf_args._fields_ = [
('va_addr', ctypes.c_uint64),
('handle', ctypes.c_uint64),
('gpu_id', ctypes.c_uint32),
('dmabuf_fd', ctypes.c_uint32),
]
# values for enumeration 'kfd_smi_event'
kfd_smi_event__enumvalues = {
0: 'KFD_SMI_EVENT_NONE',
1: 'KFD_SMI_EVENT_VMFAULT',
2: 'KFD_SMI_EVENT_THERMAL_THROTTLE',
3: 'KFD_SMI_EVENT_GPU_PRE_RESET',
4: 'KFD_SMI_EVENT_GPU_POST_RESET',
}
KFD_SMI_EVENT_NONE = 0
KFD_SMI_EVENT_VMFAULT = 1
KFD_SMI_EVENT_THERMAL_THROTTLE = 2
KFD_SMI_EVENT_GPU_PRE_RESET = 3
KFD_SMI_EVENT_GPU_POST_RESET = 4
kfd_smi_event = ctypes.c_uint32 # enum
class struct_kfd_ioctl_smi_events_args(Structure):
pass
struct_kfd_ioctl_smi_events_args._pack_ = 1 # source:False
struct_kfd_ioctl_smi_events_args._fields_ = [
('gpuid', ctypes.c_uint32),
('anon_fd', ctypes.c_uint32),
]
# values for enumeration 'kfd_mmio_remap'
kfd_mmio_remap__enumvalues = {
0: 'KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL',
4: 'KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL',
}
KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL = 0
KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL = 4
kfd_mmio_remap = ctypes.c_uint32 # enum
# values for enumeration 'kfd_ioctl_svm_op'
kfd_ioctl_svm_op__enumvalues = {
0: 'KFD_IOCTL_SVM_OP_SET_ATTR',
1: 'KFD_IOCTL_SVM_OP_GET_ATTR',
}
KFD_IOCTL_SVM_OP_SET_ATTR = 0
KFD_IOCTL_SVM_OP_GET_ATTR = 1
kfd_ioctl_svm_op = ctypes.c_uint32 # enum
# values for enumeration 'kfd_ioctl_svm_location'
kfd_ioctl_svm_location__enumvalues = {
0: 'KFD_IOCTL_SVM_LOCATION_SYSMEM',
4294967295: 'KFD_IOCTL_SVM_LOCATION_UNDEFINED',
}
KFD_IOCTL_SVM_LOCATION_SYSMEM = 0
KFD_IOCTL_SVM_LOCATION_UNDEFINED = 4294967295
kfd_ioctl_svm_location = ctypes.c_uint32 # enum
# values for enumeration 'kfd_ioctl_svm_attr_type'
kfd_ioctl_svm_attr_type__enumvalues = {
0: 'KFD_IOCTL_SVM_ATTR_PREFERRED_LOC',
1: 'KFD_IOCTL_SVM_ATTR_PREFETCH_LOC',
2: 'KFD_IOCTL_SVM_ATTR_ACCESS',
3: 'KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE',
4: 'KFD_IOCTL_SVM_ATTR_NO_ACCESS',
5: 'KFD_IOCTL_SVM_ATTR_SET_FLAGS',
6: 'KFD_IOCTL_SVM_ATTR_CLR_FLAGS',
7: 'KFD_IOCTL_SVM_ATTR_GRANULARITY',
}
KFD_IOCTL_SVM_ATTR_PREFERRED_LOC = 0
KFD_IOCTL_SVM_ATTR_PREFETCH_LOC = 1
KFD_IOCTL_SVM_ATTR_ACCESS = 2
KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE = 3
KFD_IOCTL_SVM_ATTR_NO_ACCESS = 4
KFD_IOCTL_SVM_ATTR_SET_FLAGS = 5
KFD_IOCTL_SVM_ATTR_CLR_FLAGS = 6
KFD_IOCTL_SVM_ATTR_GRANULARITY = 7
kfd_ioctl_svm_attr_type = ctypes.c_uint32 # enum
class struct_kfd_ioctl_svm_attribute(Structure):
pass
struct_kfd_ioctl_svm_attribute._pack_ = 1 # source:False
struct_kfd_ioctl_svm_attribute._fields_ = [
('type', ctypes.c_uint32),
('value', ctypes.c_uint32),
]
class struct_kfd_ioctl_svm_args(Structure):
pass
struct_kfd_ioctl_svm_args._pack_ = 1 # source:False
struct_kfd_ioctl_svm_args._fields_ = [
('start_addr', ctypes.c_uint64),
('size', ctypes.c_uint64),
('op', ctypes.c_uint32),
('nattr', ctypes.c_uint32),
('attrs', struct_kfd_ioctl_svm_attribute * 0),
]
class struct_kfd_ioctl_set_xnack_mode_args(Structure):
pass
struct_kfd_ioctl_set_xnack_mode_args._pack_ = 1 # source:False
struct_kfd_ioctl_set_xnack_mode_args._fields_ = [
('xnack_enabled', ctypes.c_int32),
]
__all__ = \
['AMDKFD_COMMAND_END', 'AMDKFD_COMMAND_START',
'AMDKFD_IOCTL_BASE', 'KFD_HW_EXCEPTION_ECC',
'KFD_HW_EXCEPTION_GPU_HANG', 'KFD_HW_EXCEPTION_PER_ENGINE_RESET',
'KFD_HW_EXCEPTION_WHOLE_GPU_RESET', 'KFD_IOCTL_H_INCLUDED',
'KFD_IOCTL_MAJOR_VERSION', 'KFD_IOCTL_MINOR_VERSION',
'KFD_IOCTL_SVM_ATTR_ACCESS', 'KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE',
'KFD_IOCTL_SVM_ATTR_CLR_FLAGS', 'KFD_IOCTL_SVM_ATTR_GRANULARITY',
'KFD_IOCTL_SVM_ATTR_NO_ACCESS',
'KFD_IOCTL_SVM_ATTR_PREFERRED_LOC',
'KFD_IOCTL_SVM_ATTR_PREFETCH_LOC', 'KFD_IOCTL_SVM_ATTR_SET_FLAGS',
'KFD_IOCTL_SVM_FLAG_COHERENT', 'KFD_IOCTL_SVM_FLAG_GPU_EXEC',
'KFD_IOCTL_SVM_FLAG_GPU_READ_MOSTLY', 'KFD_IOCTL_SVM_FLAG_GPU_RO',
'KFD_IOCTL_SVM_FLAG_HIVE_LOCAL', 'KFD_IOCTL_SVM_FLAG_HOST_ACCESS',
'KFD_IOCTL_SVM_LOCATION_SYSMEM',
'KFD_IOCTL_SVM_LOCATION_UNDEFINED', 'KFD_IOCTL_SVM_OP_GET_ATTR',
'KFD_IOCTL_SVM_OP_SET_ATTR',
'KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM',
'KFD_IOC_ALLOC_MEM_FLAGS_COHERENT',
'KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL',
'KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE',
'KFD_IOC_ALLOC_MEM_FLAGS_GTT',
'KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP',
'KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE',
'KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC',
'KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED',
'KFD_IOC_ALLOC_MEM_FLAGS_USERPTR', 'KFD_IOC_ALLOC_MEM_FLAGS_VRAM',
'KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE',
'KFD_IOC_CACHE_POLICY_COHERENT',
'KFD_IOC_CACHE_POLICY_NONCOHERENT', 'KFD_IOC_EVENT_DEBUG_EVENT',
'KFD_IOC_EVENT_DEVICESTATECHANGE', 'KFD_IOC_EVENT_HW_EXCEPTION',
'KFD_IOC_EVENT_MEMORY', 'KFD_IOC_EVENT_NODECHANGE',
'KFD_IOC_EVENT_PROFILE_EVENT', 'KFD_IOC_EVENT_QUEUE_EVENT',
'KFD_IOC_EVENT_SIGNAL', 'KFD_IOC_EVENT_SYSTEM_EVENT',
'KFD_IOC_QUEUE_TYPE_COMPUTE', 'KFD_IOC_QUEUE_TYPE_COMPUTE_AQL',
'KFD_IOC_QUEUE_TYPE_SDMA', 'KFD_IOC_QUEUE_TYPE_SDMA_XGMI',
'KFD_IOC_WAIT_RESULT_COMPLETE', 'KFD_IOC_WAIT_RESULT_FAIL',
'KFD_IOC_WAIT_RESULT_TIMEOUT', 'KFD_MAX_QUEUE_PERCENTAGE',
'KFD_MAX_QUEUE_PRIORITY', 'KFD_MEM_ERR_GPU_HANG',
'KFD_MEM_ERR_NO_RAS', 'KFD_MEM_ERR_POISON_CONSUMED',
'KFD_MEM_ERR_SRAM_ECC', 'KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL',
'KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL', 'KFD_SIGNAL_EVENT_LIMIT',
'KFD_SMI_EVENT_GPU_POST_RESET', 'KFD_SMI_EVENT_GPU_PRE_RESET',
'KFD_SMI_EVENT_NONE', 'KFD_SMI_EVENT_THERMAL_THROTTLE',
'KFD_SMI_EVENT_VMFAULT', 'MAX_ALLOWED_AW_BUFF_SIZE',
'MAX_ALLOWED_NUM_POINTS', 'MAX_ALLOWED_WAC_BUFF_SIZE',
'NUM_OF_SUPPORTED_GPUS', 'kfd_ioctl_svm_attr_type',
'kfd_ioctl_svm_location', 'kfd_ioctl_svm_op', 'kfd_mmio_remap',
'kfd_smi_event', 'struct_kfd_event_data',
'struct_kfd_hsa_hw_exception_data',
'struct_kfd_hsa_memory_exception_data',
'struct_kfd_ioctl_acquire_vm_args',
'struct_kfd_ioctl_alloc_memory_of_gpu_args',
'struct_kfd_ioctl_alloc_queue_gws_args',
'struct_kfd_ioctl_create_event_args',
'struct_kfd_ioctl_create_queue_args',
'struct_kfd_ioctl_dbg_address_watch_args',
'struct_kfd_ioctl_dbg_register_args',
'struct_kfd_ioctl_dbg_unregister_args',
'struct_kfd_ioctl_dbg_wave_control_args',
'struct_kfd_ioctl_destroy_event_args',
'struct_kfd_ioctl_destroy_queue_args',
'struct_kfd_ioctl_free_memory_of_gpu_args',
'struct_kfd_ioctl_get_clock_counters_args',
'struct_kfd_ioctl_get_dmabuf_info_args',
'struct_kfd_ioctl_get_process_apertures_args',
'struct_kfd_ioctl_get_process_apertures_new_args',
'struct_kfd_ioctl_get_queue_wave_state_args',
'struct_kfd_ioctl_get_tile_config_args',
'struct_kfd_ioctl_get_version_args',
'struct_kfd_ioctl_import_dmabuf_args',
'struct_kfd_ioctl_map_memory_to_gpu_args',
'struct_kfd_ioctl_reset_event_args',
'struct_kfd_ioctl_set_cu_mask_args',
'struct_kfd_ioctl_set_event_args',
'struct_kfd_ioctl_set_memory_policy_args',
'struct_kfd_ioctl_set_scratch_backing_va_args',
'struct_kfd_ioctl_set_trap_handler_args',
'struct_kfd_ioctl_set_xnack_mode_args',
'struct_kfd_ioctl_smi_events_args', 'struct_kfd_ioctl_svm_args',
'struct_kfd_ioctl_svm_attribute',
'struct_kfd_ioctl_unmap_memory_from_gpu_args',
'struct_kfd_ioctl_update_queue_args',
'struct_kfd_ioctl_wait_events_args',
'struct_kfd_memory_exception_failure',
'struct_kfd_process_device_apertures', 'union_kfd_event_data_0']

333
tinygrad/runtime/ops_kfd.py Normal file
View File

@@ -0,0 +1,333 @@
from __future__ import annotations
from typing import Tuple
import os, fcntl, ctypes, functools, re, pathlib, mmap, struct
from tinygrad.device import Compiled, LRUAllocator, Compiler, BufferOptions, CompilerOptions
from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up
from tinygrad.renderer.cstyle import HIPRenderer
from tinygrad.runtime.driver.hip_comgr import compile_hip
import tinygrad.runtime.autogen.kfd as kfd
import tinygrad.runtime.autogen.hsa as hsa
import tinygrad.runtime.autogen.amd_sdma as amd_sdma
if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401
libc = ctypes.CDLL("libc.so.6")
libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
libc.mmap.restype = ctypes.c_void_p
def node_sysfs_path(node_id, file): return f"/sys/devices/virtual/kfd/kfd/topology/nodes/{node_id}/{file}"
def kfd_ioctl(idir, nr, user_struct, fd, made_struct=None, **kwargs):
made = made_struct or user_struct(**kwargs)
ret = fcntl.ioctl(fd, (idir<<30) | (ctypes.sizeof(made)<<16) | (ord('K')<<8) | nr, made)
if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
return made
def ioctls_from_header():
#hdr = pathlib.Path("/usr/include/linux/kfd_ioctl.h").read_text().replace("\\\n", "")
#pattern = r'#define\s+(AMDKFD_IOC_[A-Z0-9_]+)\s+AMDKFD_(IOW?R?)\((0x[0-9a-fA-F]+),\s+struct\s([A-Za-z0-9_]+)\)'
#matches = re.findall(pattern, hdr, re.MULTILINE)
# get this from python instead
hdrpy = (pathlib.Path(__file__).parent / "autogen" / "kfd.py").read_text()
pattern = r'# (AMDKFD_IOC_[A-Z0-9_]+)\s=\s_(IOW?R?).*\(( 0x[0-9a-fA-F]+) ,\s+struct\s([A-Za-z0-9_]+)\s+\)'
matches = re.findall(pattern, hdrpy, re.MULTILINE)
idirs = {"IOW": 1, "IOR": 2, "IOWR": 3}
fxns = {name.replace("AMDKFD_IOC_", "").lower():
functools.partial(kfd_ioctl, idirs[idir], int(nr, 0x10), getattr(kfd, "struct_"+sname))
for name, idir, nr, sname in matches}
return type("KIO", (object, ), fxns)
kio = ioctls_from_header()
def create_sdma_packets():
# TODO: clean up this, if we want to keep it
structs = {}
for name,pkt in [(name,s) for name,s in amd_sdma.__dict__.items() if name.startswith("struct_SDMA_PKT_") and name.endswith("_TAG")]:
names = set()
fields = []
for pkt_fields in pkt._fields_:
if not pkt_fields[0].endswith("_UNION"): fields.append(pkt_fields)
else:
assert pkt_fields[1]._fields_[0][0] == '_0'
for union_fields in pkt_fields[1]._fields_[0][1]._fields_:
fname = union_fields[0]
if fname in names: fname = pkt_fields[0]+fname
names.add(fname)
if fname.endswith("_63_32") and fields[-1][0].endswith("_31_0"):
fields[-1] = tuple([fname[:-6], ctypes.c_ulong, 64]) # merge together 64-bit fields
else:
fields.append(tuple([fname, *union_fields[1:]]))
new_name = name[16:-4].lower()
structs[new_name] = init_c_struct_t(tuple(fields))
assert ctypes.sizeof(structs[new_name]) == ctypes.sizeof(pkt), f"{ctypes.sizeof(structs[new_name])} != {ctypes.sizeof(pkt)}"
return type("SDMA_PKTS", (object, ), structs)
sdma_pkts = create_sdma_packets()
class KFDCompiler(Compiler):
compiler_opts = CompilerOptions("KFD", has_tensor_cores=True, shared_max=65536)
def __init__(self, arch:str):
self.arch = arch
super().__init__(f"compile_hip_{self.arch}")
def render(self, name:str, uops) -> str: return HIPRenderer(name, uops)
def compile(self, src:str) -> bytes: return compile_hip(src, self.arch)
AQL_PACKET_SIZE = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t)
SDMA_MAX_COPY_SIZE = 0x400000
DISPATCH_KERNEL_SETUP = 3 << hsa.HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS
DISPATCH_KERNEL_HEADER = 1 << hsa.HSA_PACKET_HEADER_BARRIER
DISPATCH_KERNEL_HEADER |= hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE
DISPATCH_KERNEL_HEADER |= hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE
DISPATCH_KERNEL_HEADER |= hsa.HSA_PACKET_TYPE_KERNEL_DISPATCH << hsa.HSA_PACKET_HEADER_TYPE
SHT_PROGBITS = 0x1
SHF_ALLOC = 0x2
class KFDProgram:
def __init__(self, device:KFDDevice, name:str, lib:bytes):
# TODO; this API needs the type signature of the function and global_size/local_size
self.device, self.name, self.lib = device, name, lib
_phoff, _shoff, _flags, _ehsize, _phentsize, _phnum, _shentsize, _shnum, _shstrndx = struct.unpack_from("<QQIHHHHHH", self.lib, 0x20)
sections = [struct.unpack_from("<IIQQQQIIQ", self.lib, _shoff + i * _shentsize) for i in range(_shnum)]
lib_gpu_size = round_up(max(sh[5]+sh[3] for sh in sections if sh[1] == SHT_PROGBITS), 0x1000)
self.lib_gpu = self.device._gpu_alloc(lib_gpu_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=True)
lib_gpu_view = to_mv(self.lib_gpu.va_addr, lib_gpu_size)
for _, sh_type, sh_flags, sh_addr, sh_offset, sh_size, _, _, _ in sections:
if sh_type == SHT_PROGBITS and sh_flags & SHF_ALLOC: lib_gpu_view[sh_addr:sh_addr+sh_size] = self.lib[sh_offset:sh_offset+sh_size]
entry_point = min(sh[3] for sh in sections if sh[1] == SHT_PROGBITS and sh[2] & SHF_ALLOC)
self.handle = self.lib_gpu.va_addr + entry_point
self.group_segment_size = lib_gpu_view.cast("I")[entry_point//4]
self.private_segment_size = lib_gpu_view.cast("I")[entry_point//4 + 1]
self.kernargs_segment_size = lib_gpu_view.cast("I")[entry_point//4 + 2]
assert self.private_segment_size <= self.device.max_private_segment_size, \
f"{self.private_segment_size=} > {self.device.max_private_segment_size=}"
# NOTE: no programs are ever freed
def __del__(self): kio.free_memory_of_gpu(KFDDevice.kfd, handle=self.lib_gpu.handle)
def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
if not hasattr(self, "args_struct_t"):
self.args_struct_t = init_c_struct_t(tuple([(f'f{i}', ctypes.c_void_p) for i in range(len(args))] +
[(f'v{i}', ctypes.c_int) for i in range(len(vals))]))
if ctypes.sizeof(self.args_struct_t) != self.kernargs_segment_size:
raise RuntimeError(f"HSAProgram.__call__: incorrect args struct size {ctypes.sizeof(self.args_struct_t)} != {self.kernargs_segment_size}")
args_st = self.args_struct_t.from_address(self.device.kernargs.va_addr)
for i in range(len(args)): args_st.__setattr__(f'f{i}', args[i].va_addr)
for i in range(len(vals)): args_st.__setattr__(f'v{i}', vals[i])
self.device.completion_signal.value = 1 # reset the signal before call
packet = hsa.hsa_kernel_dispatch_packet_t.from_address(self.device.aql_ring.va_addr +
(self.device.aql_doorbell_value*AQL_PACKET_SIZE) % self.device.aql_ring.size)
packet.workgroup_size_x, packet.workgroup_size_y, packet.workgroup_size_z = local_size
packet.reserved0 = 0
packet.grid_size_x, packet.grid_size_y, packet.grid_size_z = tuple(g*l for g,l in zip(global_size, local_size))
packet.kernel_object = self.handle
packet.kernarg_address = self.device.kernargs.va_addr
packet.group_segment_size = self.group_segment_size
packet.private_segment_size = self.private_segment_size # what it this and why doesn't it work? (see TestOps.test_dilated_conv_transpose2d)
packet.reserved2 = 0
packet.completion_signal = hsa.hsa_signal_t(ctypes.addressof(self.device.completion_signal))
packet.setup = DISPATCH_KERNEL_SETUP
packet.header = DISPATCH_KERNEL_HEADER
# one pending packet + ring doorbell
self.device.amd_aql_queue.write_dispatch_id = self.device.aql_doorbell_value + 1
self.device.aql_doorbell[0] = self.device.aql_doorbell_value
self.device.aql_doorbell_value += 1
evt_arr = (kfd.struct_kfd_event_data * 1)()
evt_arr[0].event_id = self.device.completion_signal.event_id
kio.wait_events(KFDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=1000)
assert (wp:=self.device.amd_aql_queue.write_dispatch_id) == (rp:=self.device.amd_aql_queue.read_dispatch_id), f"didn't run {wp} != {rp}"
if wait: return (self.device.completion_signal.end_ts-self.device.completion_signal.start_ts)/1e9
class KFDAllocator(LRUAllocator):
def __init__(self, device:KFDDevice):
self.device = device
super().__init__()
def _alloc(self, size:int, options:BufferOptions):
if options.host: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True)
else: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=True)
def copyin(self, dest, src: memoryview):
# TODO: need to make the address visible to gpu and pass it directly to sdma.
self.device._map_userptr_to_gpu(ctypes.addressof(from_mv(src).contents), src.nbytes)
self.device.completion_signal.value = 1
self.device._submit_sdma(dest.va_addr, ctypes.addressof(from_mv(src).contents), src.nbytes, completion_signal=self.device.completion_signal)
evt_arr = (kfd.struct_kfd_event_data * 1)()
evt_arr[0].event_id = self.device.completion_signal.event_id
kio.wait_events(KFDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=1000)
def copyout(self, dest:memoryview, src):
self.device._map_userptr_to_gpu(ctypes.addressof(from_mv(dest).contents), dest.nbytes)
self.device.completion_signal.value = 1
self.device._submit_sdma(ctypes.addressof(from_mv(dest).contents), src.va_addr, dest.nbytes, completion_signal=self.device.completion_signal)
evt_arr = (kfd.struct_kfd_event_data * 1)()
evt_arr[0].event_id = self.device.completion_signal.event_id
kio.wait_events(KFDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=1000)
MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
class KFDDevice(Compiled):
kfd:int = -1
def _map_userptr_to_gpu(self, addr, size):
self.map_uptr2gpu_struct.start_addr = addr&~0xfff
self.map_uptr2gpu_struct.size = round_up(size+addr-(addr&~0xfff), 0x1000)
kio.svm(self.kfd, made_struct=self.map_uptr2gpu_struct)
def _gpu_alloc(self, size:int, flags:int, uncached=False, public=False, map_to_gpu=True):
flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
if uncached: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED
if public: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC
if flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR:
buf = addr = libc.mmap(0, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
else:
buf, addr = 0, libc.mmap(0, size, 0, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS|MAP_NORESERVE, -1, 0)
assert addr != 0xffffffffffffffff
mem = kio.alloc_memory_of_gpu(self.kfd, va_addr=addr, size=size, gpu_id=self.gpu_id, flags=flags, mmap_offset=buf)
if not (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR):
buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|MAP_FIXED, self.drm_fd, mem.mmap_offset)
assert buf != 0xffffffffffffffff
assert addr == buf == mem.va_addr
if map_to_gpu:
arr = (ctypes.c_int32 * 1)(self.gpu_id)
stm = kio.map_memory_to_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(arr), n_devices=1)
assert stm.n_success == 1
return mem
def __init__(self, device:str=""):
if KFDDevice.kfd == -1: KFDDevice.kfd = os.open("/dev/kfd", os.O_RDWR)
self.device_id = int(device.split(":")[1]) if ":" in device else 0
with open(node_sysfs_path(self.device_id+1, "gpu_id"), "r") as f: self.gpu_id = int(f.read())
with open(node_sysfs_path(self.device_id+1, "properties"), "r") as f: self.properties = {line.split()[0]: int(line.split()[1]) for line in f}
self.drm_fd = os.open(f"/dev/dri/renderD{self.properties['drm_render_minor']}", os.O_RDWR)
self.arch = f"gfx{self.properties['gfx_target_version']//100}"
kio.acquire_vm(KFDDevice.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id)
self.event_page = self._gpu_alloc(0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
self.sync_event = kio.create_event(KFDDevice.kfd, event_page_offset=self.event_page.handle, auto_reset=1)
self.eop_buffer = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
self.aql_ring = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
self.signals_page = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
self.gart = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
self.kernargs = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
self.ctx_save_restore_address = self._gpu_alloc(0x2C02000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
self.completion_signal = hsa.amd_signal_t.from_address(self.signals_page.va_addr)
self.completion_signal.value = 1
self.completion_signal.kind = hsa.AMD_SIGNAL_KIND_USER
self.completion_signal.event_mailbox_ptr = self.event_page.va_addr + self.sync_event.event_slot_index*8
self.completion_signal.event_id = self.sync_event.event_id
# AQL Queue
self.amd_aql_queue = hsa.amd_queue_t.from_address(self.gart.va_addr)
self.amd_aql_queue.write_dispatch_id = 0
self.amd_aql_queue.read_dispatch_id = 0
self.amd_aql_queue.read_dispatch_id_field_base_byte_offset = getattr(hsa.amd_queue_t, 'read_dispatch_id').offset
self.amd_aql_queue.queue_properties = hsa.AMD_QUEUE_PROPERTIES_IS_PTR64 | hsa.AMD_QUEUE_PROPERTIES_ENABLE_PROFILING
self.amd_aql_queue.max_cu_id = self.properties['simd_count'] // self.properties['simd_per_cu'] - 1
self.amd_aql_queue.max_wave_id = self.properties['max_waves_per_simd'] * self.properties['simd_per_cu'] - 1
# scratch setup
self.max_private_segment_size = 256
self.scratch_len = self.max_private_segment_size * (self.amd_aql_queue.max_cu_id + 1) * (self.amd_aql_queue.max_wave_id + 1)
self.scratch = self._gpu_alloc(self.scratch_len, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
self.amd_aql_queue.scratch_backing_memory_location = self.scratch.va_addr
self.amd_aql_queue.scratch_backing_memory_byte_size = self.scratch_len
self.amd_aql_queue.scratch_wave64_lane_byte_size = self.max_private_segment_size * (self.amd_aql_queue.max_wave_id + 1) // 64
self.amd_aql_queue.scratch_resource_descriptor[0] = self.scratch.va_addr & 0xFFFFFFFF
self.amd_aql_queue.scratch_resource_descriptor[1] = ((self.scratch.va_addr >> 32) & 0xFFFF) | (1 << 30) # va_hi | SWIZZLE_ENABLE
self.amd_aql_queue.scratch_resource_descriptor[2] = self.scratch_len & 0xFFFFFFFF
self.amd_aql_queue.scratch_resource_descriptor[3] = 0x20814fac # FORMAT=BUF_FORMAT_32_UINT,OOB_SELECT=2,ADD_TID_ENABLE=1,TYPE=SQ_RSRC_BUF,SQ_SELs
wave_scratch = (((self.amd_aql_queue.max_wave_id + 1) * self.max_private_segment_size + 255) // 256)
self.amd_aql_queue.compute_tmpring_size = wave_scratch << 12 | (self.amd_aql_queue.max_cu_id + 1)
self.aql_queue = kio.create_queue(KFDDevice.kfd, ring_base_address=self.aql_ring.va_addr, ring_size=self.aql_ring.size, gpu_id=self.gpu_id,
queue_type=kfd.KFD_IOC_QUEUE_TYPE_COMPUTE_AQL, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
eop_buffer_address=self.eop_buffer.va_addr, eop_buffer_size=self.eop_buffer.size,
ctx_save_restore_address=self.ctx_save_restore_address.va_addr, ctx_save_restore_size=self.ctx_save_restore_address.size,
ctl_stack_size = 0xa000,
write_pointer_address=self.gart.va_addr + getattr(hsa.amd_queue_t, 'write_dispatch_id').offset,
read_pointer_address=self.gart.va_addr + getattr(hsa.amd_queue_t, 'read_dispatch_id').offset)
self.doorbells_base = self.aql_queue.doorbell_offset & (~0xfff)
self.doorbells = libc.mmap(0, 8192, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, KFDDevice.kfd, self.doorbells_base)
self.aql_doorbell = to_mv(self.doorbells + self.aql_queue.doorbell_offset - self.doorbells_base, 4).cast("I")
self.aql_doorbell_value = 0
# SDMA Queue
self.sdma_ring = self._gpu_alloc(1 << 20, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
self.sdma_queue = kio.create_queue(KFDDevice.kfd, ring_base_address=self.sdma_ring.va_addr, ring_size=self.sdma_ring.size, gpu_id=self.gpu_id,
queue_type=kfd.KFD_IOC_QUEUE_TYPE_SDMA, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
write_pointer_address=self.gart.va_addr + 0x100, read_pointer_address=self.gart.va_addr + 0x108)
self.sdma_read_pointer = to_mv(self.sdma_queue.read_pointer_address, 8).cast("Q")
self.sdma_write_pointer = to_mv(self.sdma_queue.write_pointer_address, 8).cast("Q")
self.sdma_doorbell = to_mv(self.doorbells + self.sdma_queue.doorbell_offset - self.doorbells_base, 4).cast("I")
self.sdma_doorbell_value = 0
# prebuilt packets
self.sdma_flush_hdp_pkt = sdma_pkts.hdp_flush(0x8, 0x0, 0x80000000, 0x0, 0x0, 0x0)
self.sdma_cache_inv = sdma_pkts.gcr(op=amd_sdma.SDMA_OP_GCR, sub_op=amd_sdma.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1,
GCR_CONTROL_GL2_INV=1, GCR_CONTROL_GL1_INV=1, GCR_CONTROL_GLV_INV=1, GCR_CONTROL_GLK_INV=1,
GCR_CONTROL_GL2_RANGE=0)
self.sdma_cache_wb = sdma_pkts.gcr(op=amd_sdma.SDMA_OP_GCR, sub_op=amd_sdma.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1,
GCR_CONTROL_GL2_RANGE=0)
# Helpers
map_uptr2gpu_struct_t = init_c_struct_t(tuple(kfd.struct_kfd_ioctl_svm_args._fields_[:-1]+[('attrs', kfd.struct_kfd_ioctl_svm_attribute*2)])) # type: ignore
self.map_uptr2gpu_struct = map_uptr2gpu_struct_t(nattr=2, op=0x0)
self.map_uptr2gpu_struct.attrs[0].type = kfd.KFD_IOCTL_SVM_ATTR_SET_FLAGS
self.map_uptr2gpu_struct.attrs[0].value = kfd.KFD_IOCTL_SVM_FLAG_COHERENT
self.map_uptr2gpu_struct.attrs[1].type = kfd.KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE
self.map_uptr2gpu_struct.attrs[1].value = self.gpu_id
super().__init__(device, KFDAllocator(self), KFDCompiler(self.arch), functools.partial(KFDProgram, self))
def _submit_sdma(self, dest, src, copy_size, wait_signals=None, completion_signal=None):
def blit_sdma_command(cmd):
ctypes.memmove(self.sdma_ring.va_addr + (self.sdma_doorbell_value % self.sdma_ring.size), ctypes.addressof(cmd), sz:=ctypes.sizeof(cmd))
self.sdma_doorbell_value += sz
if wait_signals is not None:
# NOTE: we check only low 32 bits to be zeroed, we don't use higher values for signals
for sig in wait_signals:
poll_addr = ctypes.addressof(sig) + getattr(hsa.amd_signal_t, 'value').offset
blit_sdma_command(sdma_pkts.poll_regmem(op=amd_sdma.SDMA_OP_POLL_REGMEM, mem_poll=1, func=0x3, addr=poll_addr,
value=0, mask=0xffffffff, interval=0x04, retry_count=0xfff))
if completion_signal is not None:
blit_sdma_command(sdma_pkts.timestamp(op=amd_sdma.SDMA_OP_TIMESTAMP, sub_op=amd_sdma.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL,
addr=ctypes.addressof(completion_signal) + getattr(hsa.amd_signal_t, 'start_ts').offset))
blit_sdma_command(self.sdma_flush_hdp_pkt)
blit_sdma_command(self.sdma_cache_inv)
copied = 0
copies_commands = (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
for _ in range(copies_commands):
step_copy_size = min(copy_size - copied, SDMA_MAX_COPY_SIZE)
blit_sdma_command(sdma_pkts.copy_linear(op=amd_sdma.SDMA_OP_COPY, sub_op=amd_sdma.SDMA_SUBOP_COPY_LINEAR,
count=step_copy_size-1, src_addr=src+copied, dst_addr=dest+copied))
copied += step_copy_size
blit_sdma_command(self.sdma_cache_wb)
if completion_signal is not None:
blit_sdma_command(sdma_pkts.timestamp(op=amd_sdma.SDMA_OP_TIMESTAMP, sub_op=amd_sdma.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL,
addr=ctypes.addressof(completion_signal) + getattr(hsa.amd_signal_t, 'end_ts').offset))
if completion_signal is not None:
signal_addr = ctypes.addressof(completion_signal) + getattr(hsa.amd_signal_t, 'value').offset
blit_sdma_command(sdma_pkts.atomic(op=amd_sdma.SDMA_OP_ATOMIC, operation=amd_sdma.SDMA_ATOMIC_ADD64, addr=signal_addr, src_data=(1<<64)-1))
if completion_signal.event_mailbox_ptr != 0:
blit_sdma_command(sdma_pkts.fence(op=amd_sdma.SDMA_OP_FENCE, mtype=3, addr=completion_signal.event_mailbox_ptr,
data=completion_signal.event_id))
blit_sdma_command(sdma_pkts.trap(op=amd_sdma.SDMA_OP_TRAP, int_ctx=completion_signal.event_id))
self.sdma_write_pointer[0] = self.sdma_doorbell_value
self.sdma_doorbell[0] = self.sdma_doorbell_value