mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-11 23:46:02 +08:00
kfd driver wip (#3912)
* kfd driver wip * cleanups * kfd almost ready to ring doorbell * ding dong? * issues with signals * something * works * ops kfd * add amd_signal_t * works...sometimes * program runs * _gpu_alloc cleanup * cleanups * work * header + enable profiling (#3959) * header + enable profiling * just cleaner * measure * only local time domain * remove old comments * fix with master * elf parsing (#3965) * elf parsing * fix kernels with private * not used * clean up * clean up 2 * add flags * kfd sdma (#3970) * working sdma * remove driver, shorter * all commands we might need * svm * kfd remove hardcoded values (#4007) * remove hardcoded values * match above line * 7k lines + revert hsa * update that from origin * fix sdma reg gen * not the updated SDMA * compiler_opts * don't require kfd_ioctl * get ioctls from python * get ioctls from python * remove build_sdma_command * merge into 64-bit fields * shorter * fix property spelling and off by one --------- Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com>
This commit is contained in:
4
.github/workflows/test.yml
vendored
4
.github/workflows/test.yml
vendored
@@ -112,8 +112,8 @@ jobs:
|
||||
python -c "from tinygrad.tensor import Tensor; print(Tensor([1,2,3,4,5]))"
|
||||
- name: Test DEBUG
|
||||
run: DEBUG=100 python3 -c "from tinygrad import Tensor; N = 1024; a, b = Tensor.rand(N, N), Tensor.rand(N, N); c = (a.reshape(N, 1, N) * b.T.reshape(1, N, N)).sum(axis=2); print((c.numpy() - (a.numpy() @ b.numpy())).mean())"
|
||||
- name: Repo line count <6500 lines
|
||||
run: MAX_LINE_COUNT=6500 python sz.py
|
||||
- name: Repo line count <7000 lines
|
||||
run: MAX_LINE_COUNT=7000 python sz.py
|
||||
|
||||
testcpuimagenet:
|
||||
name: ImageNet to C Tests
|
||||
|
||||
@@ -56,6 +56,13 @@ generate_comgr() {
|
||||
python3 -c "import tinygrad.runtime.autogen.comgr"
|
||||
}
|
||||
|
||||
generate_kfd() {
|
||||
clang2py /usr/include/linux/kfd_ioctl.h -o $BASE/kfd.py -k cdefstum
|
||||
fixup $BASE/kfd.py
|
||||
sed -i "s\import ctypes\import ctypes, os\g" $BASE/kfd.py
|
||||
python3 -c "import tinygrad.runtime.autogen.kfd"
|
||||
}
|
||||
|
||||
generate_cuda() {
|
||||
clang2py /usr/include/cuda.h /usr/include/nvrtc.h -o $BASE/cuda.py -l /usr/lib/x86_64-linux-gnu/libcuda.so -l /usr/lib/x86_64-linux-gnu/libnvrtc.so
|
||||
sed -i "s\import ctypes\import ctypes, ctypes.util\g" $BASE/cuda.py
|
||||
@@ -69,10 +76,19 @@ generate_hsa() {
|
||||
clang2py \
|
||||
/opt/rocm/include/hsa/hsa.h \
|
||||
/opt/rocm/include/hsa/hsa_ext_amd.h \
|
||||
/opt/rocm/include/hsa/amd_hsa_signal.h \
|
||||
/opt/rocm/include/hsa/amd_hsa_queue.h \
|
||||
/opt/rocm/include/hsa/hsa_ext_finalize.h /opt/rocm/include/hsa/hsa_ext_image.h \
|
||||
--clang-args="-I/opt/rocm/include" \
|
||||
-o $BASE/hsa.py -l /opt/rocm/lib/libhsa-runtime64.so
|
||||
|
||||
# clang2py broken when pass -x c++ to prev headers
|
||||
clang2py extra/hip_gpu_driver/sdma_registers.h \
|
||||
--clang-args="-I/opt/rocm/include -x c++" \
|
||||
-o $BASE/amd_sdma.py -l /opt/rocm/lib/libhsa-runtime64.so
|
||||
|
||||
fixup $BASE/hsa.py
|
||||
fixup $BASE/amd_sdma.py
|
||||
sed -i "s\import ctypes\import ctypes, os\g" $BASE/hsa.py
|
||||
sed -i "s\'/opt/rocm/\os.getenv('ROCM_PATH', '/opt/rocm/')+'/\g" $BASE/hsa.py
|
||||
python3 -c "import tinygrad.runtime.autogen.hsa"
|
||||
@@ -83,6 +99,7 @@ elif [ "$1" == "hip" ]; then generate_hip
|
||||
elif [ "$1" == "comgr" ]; then generate_comgr
|
||||
elif [ "$1" == "cuda" ]; then generate_cuda
|
||||
elif [ "$1" == "hsa" ]; then generate_hsa
|
||||
elif [ "$1" == "all" ]; then generate_opencl; generate_hip; generate_comgr; generate_cuda; generate_hsa
|
||||
elif [ "$1" == "kfd" ]; then generate_kfd
|
||||
elif [ "$1" == "all" ]; then generate_opencl; generate_hip; generate_comgr; generate_cuda; generate_hsa; generate_kfd
|
||||
else echo "usage: $0 <type>"
|
||||
fi
|
||||
|
||||
@@ -44,10 +44,9 @@ def install_hook(c_function, python_function):
|
||||
|
||||
# *** ioctl lib end ***
|
||||
|
||||
# clang2py kfd_ioctl.h -o kfd_ioctl.py
|
||||
from extra.hip_gpu_driver import kfd_ioctl
|
||||
import tinygrad.runtime.autogen.kfd as kfd_ioctl
|
||||
def ioctls_from_header():
|
||||
hdr = (pathlib.Path(__file__).parent.parent.parent / "extra/hip_gpu_driver/kfd_ioctl.h").read_text().replace("\\\n", "")
|
||||
hdr = pathlib.Path("/usr/include/linux/kfd_ioctl.h").read_text().replace("\\\n", "")
|
||||
pattern = r'#define\s+(AMDKFD_IOC_[A-Z0-9_]+)\s+AMDKFD_IOW?R?\((0x[0-9a-fA-F]+),\s+struct\s([A-Za-z0-9_]+)\)'
|
||||
matches = re.findall(pattern, hdr, re.MULTILINE)
|
||||
return {int(nr, 0x10):(name, getattr(kfd_ioctl, "struct_"+sname)) for name, nr, sname in matches}
|
||||
@@ -68,7 +67,8 @@ def ioctl(fd, request, argp):
|
||||
out = ctypes.cast(s.attrs, ctypes.POINTER(kfd_ioctl.struct_kfd_ioctl_svm_attribute))
|
||||
for i in range(s.nattr): print(f"{i}: {kfd_ioctl.kfd_ioctl_svm_attr_type__enumvalues[out[i].type]:40s}: {out[i].value:#x}")
|
||||
else:
|
||||
print("ioctl", f"{idir=} {size=} {itype=} {nr=} {fd=} {ret=}", os.readlink(f"/proc/self/fd/{fd}") if fd >= 0 else "")
|
||||
print(f"{(st-start)*1000:7.2f} ms +{et*1000.:7.2f} ms : ioctl",
|
||||
f"{idir=} {size=} {itype=} {nr=} {fd=} {ret=}", os.readlink(f"/proc/self/fd/{fd}") if fd >= 0 else "")
|
||||
return ret
|
||||
|
||||
install_hook(libc.ioctl, ioctl)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
1
extra/hip_gpu_driver/kfd_ioctl.h
Symbolic link
1
extra/hip_gpu_driver/kfd_ioctl.h
Symbolic link
@@ -0,0 +1 @@
|
||||
/usr/include/linux/kfd_ioctl.h
|
||||
File diff suppressed because it is too large
Load Diff
571
extra/hip_gpu_driver/sdma_registers.h
Normal file
571
extra/hip_gpu_driver/sdma_registers.h
Normal file
@@ -0,0 +1,571 @@
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// The University of Illinois/NCSA
|
||||
// Open Source License (NCSA)
|
||||
//
|
||||
// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Developed by:
|
||||
//
|
||||
// AMD Research and AMD HSA Software Development
|
||||
//
|
||||
// Advanced Micro Devices, Inc.
|
||||
//
|
||||
// www.amd.com
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to
|
||||
// deal with the Software without restriction, including without limitation
|
||||
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
// and/or sell copies of the Software, and to permit persons to whom the
|
||||
// Software is furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// - Redistributions of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimers.
|
||||
// - Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimers in
|
||||
// the documentation and/or other materials provided with the distribution.
|
||||
// - Neither the names of Advanced Micro Devices, Inc,
|
||||
// nor the names of its contributors may be used to endorse or promote
|
||||
// products derived from this Software without specific prior written
|
||||
// permission.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
// DEALINGS WITH THE SOFTWARE.
|
||||
//
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef HSA_RUNTIME_CORE_INC_SDMA_REGISTERS_H_
|
||||
#define HSA_RUNTIME_CORE_INC_SDMA_REGISTERS_H_
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
namespace rocr {
|
||||
namespace AMD {
|
||||
|
||||
// SDMA packet for VI device.
|
||||
// Reference: http://people.freedesktop.org/~agd5f/dma_packets.txt
|
||||
|
||||
const unsigned int SDMA_OP_COPY = 1;
|
||||
const unsigned int SDMA_OP_FENCE = 5;
|
||||
const unsigned int SDMA_OP_TRAP = 6;
|
||||
const unsigned int SDMA_OP_POLL_REGMEM = 8;
|
||||
const unsigned int SDMA_OP_ATOMIC = 10;
|
||||
const unsigned int SDMA_OP_CONST_FILL = 11;
|
||||
const unsigned int SDMA_OP_TIMESTAMP = 13;
|
||||
const unsigned int SDMA_OP_GCR = 17;
|
||||
const unsigned int SDMA_SUBOP_COPY_LINEAR = 0;
|
||||
const unsigned int SDMA_SUBOP_COPY_LINEAR_RECT = 4;
|
||||
const unsigned int SDMA_SUBOP_TIMESTAMP_GET_GLOBAL = 2;
|
||||
const unsigned int SDMA_SUBOP_USER_GCR = 1;
|
||||
const unsigned int SDMA_ATOMIC_ADD64 = 47;
|
||||
|
||||
typedef struct SDMA_PKT_COPY_LINEAR_TAG {
|
||||
union {
|
||||
struct {
|
||||
unsigned int op : 8;
|
||||
unsigned int sub_op : 8;
|
||||
unsigned int extra_info : 16;
|
||||
};
|
||||
unsigned int DW_0_DATA;
|
||||
} HEADER_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int count : 22;
|
||||
unsigned int reserved_0 : 10;
|
||||
};
|
||||
unsigned int DW_1_DATA;
|
||||
} COUNT_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int reserved_0 : 16;
|
||||
unsigned int dst_swap : 2;
|
||||
unsigned int reserved_1 : 6;
|
||||
unsigned int src_swap : 2;
|
||||
unsigned int reserved_2 : 6;
|
||||
};
|
||||
unsigned int DW_2_DATA;
|
||||
} PARAMETER_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int src_addr_31_0 : 32;
|
||||
};
|
||||
unsigned int DW_3_DATA;
|
||||
} SRC_ADDR_LO_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int src_addr_63_32 : 32;
|
||||
};
|
||||
unsigned int DW_4_DATA;
|
||||
} SRC_ADDR_HI_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int dst_addr_31_0 : 32;
|
||||
};
|
||||
unsigned int DW_5_DATA;
|
||||
} DST_ADDR_LO_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int dst_addr_63_32 : 32;
|
||||
};
|
||||
unsigned int DW_6_DATA;
|
||||
} DST_ADDR_HI_UNION;
|
||||
|
||||
static const size_t kMaxSize_ = 0x3fffe0;
|
||||
} SDMA_PKT_COPY_LINEAR;
|
||||
|
||||
// linear sub-window
|
||||
typedef struct SDMA_PKT_COPY_LINEAR_RECT_TAG {
|
||||
static const unsigned int pitch_bits = 19;
|
||||
static const unsigned int slice_bits = 28;
|
||||
static const unsigned int rect_xy_bits = 14;
|
||||
static const unsigned int rect_z_bits = 11;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int op : 8;
|
||||
unsigned int sub_op : 8;
|
||||
unsigned int reserved : 13;
|
||||
unsigned int element : 3;
|
||||
};
|
||||
unsigned int DW_0_DATA;
|
||||
} HEADER_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int src_addr_31_0 : 32;
|
||||
};
|
||||
unsigned int DW_1_DATA;
|
||||
} SRC_ADDR_LO_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int src_addr_63_32 : 32;
|
||||
};
|
||||
unsigned int DW_2_DATA;
|
||||
} SRC_ADDR_HI_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int src_offset_x : 14;
|
||||
unsigned int reserved_1 : 2;
|
||||
unsigned int src_offset_y : 14;
|
||||
unsigned int reserved_2 : 2;
|
||||
};
|
||||
unsigned int DW_3_DATA;
|
||||
} SRC_PARAMETER_1_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int src_offset_z : 11;
|
||||
unsigned int reserved_1 : 2;
|
||||
unsigned int src_pitch : pitch_bits;
|
||||
};
|
||||
unsigned int DW_4_DATA;
|
||||
} SRC_PARAMETER_2_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int src_slice_pitch : slice_bits;
|
||||
unsigned int reserved_1 : 4;
|
||||
};
|
||||
unsigned int DW_5_DATA;
|
||||
} SRC_PARAMETER_3_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int dst_addr_31_0 : 32;
|
||||
};
|
||||
unsigned int DW_6_DATA;
|
||||
} DST_ADDR_LO_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int dst_addr_63_32 : 32;
|
||||
};
|
||||
unsigned int DW_7_DATA;
|
||||
} DST_ADDR_HI_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int dst_offset_x : 14;
|
||||
unsigned int reserved_1 : 2;
|
||||
unsigned int dst_offset_y : 14;
|
||||
unsigned int reserved_2 : 2;
|
||||
};
|
||||
unsigned int DW_8_DATA;
|
||||
} DST_PARAMETER_1_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int dst_offset_z : 11;
|
||||
unsigned int reserved_1 : 2;
|
||||
unsigned int dst_pitch : pitch_bits;
|
||||
};
|
||||
unsigned int DW_9_DATA;
|
||||
} DST_PARAMETER_2_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int dst_slice_pitch : slice_bits;
|
||||
unsigned int reserved_1 : 4;
|
||||
};
|
||||
unsigned int DW_10_DATA;
|
||||
} DST_PARAMETER_3_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int rect_x : rect_xy_bits;
|
||||
unsigned int reserved_1 : 2;
|
||||
unsigned int rect_y : rect_xy_bits;
|
||||
unsigned int reserved_2 : 2;
|
||||
};
|
||||
unsigned int DW_11_DATA;
|
||||
} RECT_PARAMETER_1_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int rect_z : rect_z_bits;
|
||||
unsigned int reserved_1 : 5;
|
||||
unsigned int dst_swap : 2;
|
||||
unsigned int reserved_2 : 6;
|
||||
unsigned int src_swap : 2;
|
||||
unsigned int reserved_3 : 6;
|
||||
};
|
||||
unsigned int DW_12_DATA;
|
||||
} RECT_PARAMETER_2_UNION;
|
||||
|
||||
} SDMA_PKT_COPY_LINEAR_RECT;
|
||||
|
||||
typedef struct SDMA_PKT_CONSTANT_FILL_TAG {
|
||||
union {
|
||||
struct {
|
||||
unsigned int op : 8;
|
||||
unsigned int sub_op : 8;
|
||||
unsigned int sw : 2;
|
||||
unsigned int reserved_0 : 12;
|
||||
unsigned int fillsize : 2;
|
||||
};
|
||||
unsigned int DW_0_DATA;
|
||||
} HEADER_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int dst_addr_31_0 : 32;
|
||||
};
|
||||
unsigned int DW_1_DATA;
|
||||
} DST_ADDR_LO_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int dst_addr_63_32 : 32;
|
||||
};
|
||||
unsigned int DW_2_DATA;
|
||||
} DST_ADDR_HI_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int src_data_31_0 : 32;
|
||||
};
|
||||
unsigned int DW_3_DATA;
|
||||
} DATA_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int count : 22;
|
||||
unsigned int reserved_0 : 10;
|
||||
};
|
||||
unsigned int DW_4_DATA;
|
||||
} COUNT_UNION;
|
||||
|
||||
static const size_t kMaxSize_ = 0x3fffe0;
|
||||
} SDMA_PKT_CONSTANT_FILL;
|
||||
|
||||
typedef struct SDMA_PKT_FENCE_TAG {
|
||||
union {
|
||||
struct {
|
||||
unsigned int op : 8;
|
||||
unsigned int sub_op : 8;
|
||||
unsigned int mtype : 3;
|
||||
unsigned int gcc : 1;
|
||||
unsigned int sys : 1;
|
||||
unsigned int pad1 : 1;
|
||||
unsigned int snp : 1;
|
||||
unsigned int gpa : 1;
|
||||
unsigned int l2_policy : 2;
|
||||
unsigned int reserved_0 : 6;
|
||||
};
|
||||
unsigned int DW_0_DATA;
|
||||
} HEADER_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int addr_31_0 : 32;
|
||||
};
|
||||
unsigned int DW_1_DATA;
|
||||
} ADDR_LO_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int addr_63_32 : 32;
|
||||
};
|
||||
unsigned int DW_2_DATA;
|
||||
} ADDR_HI_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int data : 32;
|
||||
};
|
||||
unsigned int DW_3_DATA;
|
||||
} DATA_UNION;
|
||||
} SDMA_PKT_FENCE;
|
||||
|
||||
typedef struct SDMA_PKT_POLL_REGMEM_TAG {
|
||||
union {
|
||||
struct {
|
||||
unsigned int op : 8;
|
||||
unsigned int sub_op : 8;
|
||||
unsigned int reserved_0 : 10;
|
||||
unsigned int hdp_flush : 1;
|
||||
unsigned int reserved_1 : 1;
|
||||
unsigned int func : 3;
|
||||
unsigned int mem_poll : 1;
|
||||
};
|
||||
unsigned int DW_0_DATA;
|
||||
} HEADER_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int addr_31_0 : 32;
|
||||
};
|
||||
unsigned int DW_1_DATA;
|
||||
} ADDR_LO_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int addr_63_32 : 32;
|
||||
};
|
||||
unsigned int DW_2_DATA;
|
||||
} ADDR_HI_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int value : 32;
|
||||
};
|
||||
unsigned int DW_3_DATA;
|
||||
} VALUE_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int mask : 32;
|
||||
};
|
||||
unsigned int DW_4_DATA;
|
||||
} MASK_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int interval : 16;
|
||||
unsigned int retry_count : 12;
|
||||
unsigned int reserved_0 : 4;
|
||||
};
|
||||
unsigned int DW_5_DATA;
|
||||
} DW5_UNION;
|
||||
} SDMA_PKT_POLL_REGMEM;
|
||||
|
||||
typedef struct SDMA_PKT_ATOMIC_TAG {
|
||||
union {
|
||||
struct {
|
||||
unsigned int op : 8;
|
||||
unsigned int sub_op : 8;
|
||||
unsigned int l : 1;
|
||||
unsigned int reserved_0 : 8;
|
||||
unsigned int operation : 7;
|
||||
};
|
||||
unsigned int DW_0_DATA;
|
||||
} HEADER_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int addr_31_0 : 32;
|
||||
};
|
||||
unsigned int DW_1_DATA;
|
||||
} ADDR_LO_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int addr_63_32 : 32;
|
||||
};
|
||||
unsigned int DW_2_DATA;
|
||||
} ADDR_HI_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int src_data_31_0 : 32;
|
||||
};
|
||||
unsigned int DW_3_DATA;
|
||||
} SRC_DATA_LO_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int src_data_63_32 : 32;
|
||||
};
|
||||
unsigned int DW_4_DATA;
|
||||
} SRC_DATA_HI_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int cmp_data_31_0 : 32;
|
||||
};
|
||||
unsigned int DW_5_DATA;
|
||||
} CMP_DATA_LO_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int cmp_data_63_32 : 32;
|
||||
};
|
||||
unsigned int DW_6_DATA;
|
||||
} CMP_DATA_HI_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int loop_interval : 13;
|
||||
unsigned int reserved_0 : 19;
|
||||
};
|
||||
unsigned int DW_7_DATA;
|
||||
} LOOP_UNION;
|
||||
} SDMA_PKT_ATOMIC;
|
||||
|
||||
typedef struct SDMA_PKT_TIMESTAMP_TAG {
|
||||
union {
|
||||
struct {
|
||||
unsigned int op : 8;
|
||||
unsigned int sub_op : 8;
|
||||
unsigned int reserved_0 : 16;
|
||||
};
|
||||
unsigned int DW_0_DATA;
|
||||
} HEADER_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int addr_31_0 : 32;
|
||||
};
|
||||
unsigned int DW_1_DATA;
|
||||
} ADDR_LO_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int addr_63_32 : 32;
|
||||
};
|
||||
unsigned int DW_2_DATA;
|
||||
} ADDR_HI_UNION;
|
||||
|
||||
} SDMA_PKT_TIMESTAMP;
|
||||
|
||||
typedef struct SDMA_PKT_TRAP_TAG {
|
||||
union {
|
||||
struct {
|
||||
unsigned int op : 8;
|
||||
unsigned int sub_op : 8;
|
||||
unsigned int reserved_0 : 16;
|
||||
};
|
||||
unsigned int DW_0_DATA;
|
||||
} HEADER_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int int_ctx : 28;
|
||||
unsigned int reserved_1 : 4;
|
||||
};
|
||||
unsigned int DW_1_DATA;
|
||||
} INT_CONTEXT_UNION;
|
||||
} SDMA_PKT_TRAP;
|
||||
|
||||
// HDP flush packet, no parameters.
|
||||
typedef struct SDMA_PKT_HDP_FLUSH_TAG {
|
||||
unsigned int DW_0_DATA;
|
||||
unsigned int DW_1_DATA;
|
||||
unsigned int DW_2_DATA;
|
||||
unsigned int DW_3_DATA;
|
||||
unsigned int DW_4_DATA;
|
||||
unsigned int DW_5_DATA;
|
||||
|
||||
// Version of gfx9 sDMA microcode introducing SDMA_PKT_HDP_FLUSH
|
||||
static const uint16_t kMinVersion_ = 0x1A5;
|
||||
} SDMA_PKT_HDP_FLUSH;
|
||||
static const SDMA_PKT_HDP_FLUSH hdp_flush_cmd = {0x8, 0x0, 0x80000000, 0x0, 0x0, 0x0};
|
||||
|
||||
typedef struct SDMA_PKT_GCR_TAG {
|
||||
union {
|
||||
struct {
|
||||
unsigned int op : 8;
|
||||
unsigned int sub_op : 8;
|
||||
unsigned int : 16;
|
||||
};
|
||||
unsigned int DW_0_DATA;
|
||||
} HEADER_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int : 7;
|
||||
unsigned int BaseVA_LO : 25;
|
||||
};
|
||||
unsigned int DW_1_DATA;
|
||||
} WORD1_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int BaseVA_HI : 16;
|
||||
unsigned int GCR_CONTROL_GLI_INV : 2;
|
||||
unsigned int GCR_CONTROL_GL1_RANGE : 2;
|
||||
unsigned int GCR_CONTROL_GLM_WB : 1;
|
||||
unsigned int GCR_CONTROL_GLM_INV : 1;
|
||||
unsigned int GCR_CONTROL_GLK_WB : 1;
|
||||
unsigned int GCR_CONTROL_GLK_INV : 1;
|
||||
unsigned int GCR_CONTROL_GLV_INV : 1;
|
||||
unsigned int GCR_CONTROL_GL1_INV : 1;
|
||||
unsigned int GCR_CONTROL_GL2_US : 1;
|
||||
unsigned int GCR_CONTROL_GL2_RANGE : 2;
|
||||
unsigned int GCR_CONTROL_GL2_DISCARD : 1;
|
||||
unsigned int GCR_CONTROL_GL2_INV : 1;
|
||||
unsigned int GCR_CONTROL_GL2_WB : 1;
|
||||
};
|
||||
unsigned int DW_2_DATA;
|
||||
} WORD2_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int GCR_CONTROL_RANGE_IS_PA : 1;
|
||||
unsigned int GCR_CONTROL_SEQ : 2;
|
||||
unsigned int : 4;
|
||||
unsigned int LimitVA_LO : 25;
|
||||
};
|
||||
unsigned int DW_3_DATA;
|
||||
} WORD3_UNION;
|
||||
|
||||
union {
|
||||
struct {
|
||||
unsigned int LimitVA_HI : 16;
|
||||
unsigned int : 8;
|
||||
unsigned int VMID : 4;
|
||||
unsigned int : 4;
|
||||
};
|
||||
unsigned int DW_4_DATA;
|
||||
} WORD4_UNION;
|
||||
} SDMA_PKT_GCR;
|
||||
|
||||
} // namespace amd
|
||||
} // namespace rocr
|
||||
|
||||
#endif // HSA_RUNTIME_CORE_INC_SDMA_REGISTERS_H_
|
||||
207
extra/hip_gpu_driver/test_kfd_2.py
Normal file
207
extra/hip_gpu_driver/test_kfd_2.py
Normal file
@@ -0,0 +1,207 @@
|
||||
import os, ctypes, pathlib, re, fcntl, functools, mmap, time
|
||||
import tinygrad.runtime.autogen.kfd as kfd
|
||||
from tinygrad.helpers import to_mv, getenv
|
||||
from extra.hip_gpu_driver import hip_ioctl
|
||||
import tinygrad.runtime.autogen.hsa as hsa
|
||||
from hexdump import hexdump
|
||||
|
||||
libc = ctypes.CDLL("libc.so.6")
|
||||
libc.memset.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_int]
|
||||
libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
|
||||
libc.mmap.restype = ctypes.c_void_p
|
||||
MAP_NORESERVE = 0x4000
|
||||
MAP_FIXED = 0x10
|
||||
|
||||
def kfd_ioctl(idir, nr, user_struct, fd, **kwargs):
|
||||
made = user_struct(**kwargs)
|
||||
ret = fcntl.ioctl(fd, (idir<<30) | (ctypes.sizeof(user_struct)<<16) | (ord('K')<<8) | nr, made)
|
||||
if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
|
||||
return made
|
||||
|
||||
def format_struct(s):
|
||||
sdats = []
|
||||
for field_name, field_type in s._fields_:
|
||||
dat = getattr(s, field_name)
|
||||
if isinstance(dat, int): sdats.append(f"{field_name}:0x{dat:X}")
|
||||
else: sdats.append(f"{field_name}:{dat}")
|
||||
return sdats
|
||||
|
||||
idirs = {"IOW": 1, "IOR": 2, "IOWR": 3}
|
||||
def ioctls_from_header():
|
||||
hdr = pathlib.Path("/usr/include/linux/kfd_ioctl.h").read_text().replace("\\\n", "")
|
||||
pattern = r'#define\s+(AMDKFD_IOC_[A-Z0-9_]+)\s+AMDKFD_(IOW?R?)\((0x[0-9a-fA-F]+),\s+struct\s([A-Za-z0-9_]+)\)'
|
||||
matches = re.findall(pattern, hdr, re.MULTILINE)
|
||||
|
||||
fxns = {}
|
||||
for name, idir, nr, sname in matches:
|
||||
fxns[name.replace("AMDKFD_IOC_", "").lower()] = functools.partial(kfd_ioctl, idirs[idir], int(nr, 0x10), getattr(kfd, "struct_"+sname))
|
||||
return type("KIO", (object, ), fxns)
|
||||
kio = ioctls_from_header()
|
||||
|
||||
# sudo su -c "echo 'file drivers/gpu/drm/amd/* +p' > /sys/kernel/debug/dynamic_debug/control"
|
||||
|
||||
def gpu_alloc_userptr(fd, size, flags):
|
||||
addr = libc.mmap(0, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
|
||||
assert addr != 0xffffffffffffffff
|
||||
mem = kio.alloc_memory_of_gpu(fd, va_addr=addr, size=size, gpu_id=GPU_ID, flags=flags, mmap_offset=addr)
|
||||
return mem
|
||||
|
||||
def gpu_alloc(fd, size, flags):
|
||||
addr = libc.mmap(0, size, 0, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS|MAP_NORESERVE, -1, 0)
|
||||
assert addr != 0xffffffffffffffff
|
||||
mem = kio.alloc_memory_of_gpu(fd, va_addr=addr, size=size, gpu_id=GPU_ID, flags=flags)
|
||||
buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|MAP_FIXED, drm_fd, mem.mmap_offset)
|
||||
assert buf != 0xffffffffffffffff
|
||||
assert addr == buf == mem.va_addr
|
||||
return mem
|
||||
|
||||
if __name__ == "__main__":
|
||||
fd = os.open("/dev/kfd", os.O_RDWR)
|
||||
gpu_num = getenv("GPU", 0)
|
||||
drm_fd = os.open(f"/dev/dri/renderD{128+gpu_num}", os.O_RDWR)
|
||||
with open(f"/sys/devices/virtual/kfd/kfd/topology/nodes/{1+gpu_num}/gpu_id", "r") as f: GPU_ID = int(f.read())
|
||||
|
||||
#ver = kio.get_version(fd)
|
||||
st = kio.acquire_vm(fd, drm_fd=drm_fd, gpu_id=GPU_ID)
|
||||
#exit(0)
|
||||
|
||||
# 0xF0000001 = KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC | KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
|
||||
# 0xD6000002 = KFD_IOC_ALLOC_MEM_FLAGS_GTT | KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
|
||||
# 0xD6000004 = KFD_IOC_ALLOC_MEM_FLAGS_USERPTR | KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
|
||||
# 0x94000010 = KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP | KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
|
||||
#addr = libc.mmap(0, 0x1000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS, -1, 0)
|
||||
#addr = libc.mmap(0, 0x1000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
|
||||
#mem = kio.AMDKFD_IOC_ALLOC_MEMORY_OF_GPU(fd, va_addr=addr, size=0x1000, gpu_id=GPU_ID, flags=0xD6000004)
|
||||
|
||||
#mem = gpu_alloc(fd, 0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM |
|
||||
# kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE |
|
||||
# kfd.KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE)
|
||||
#arr = (ctypes.c_int32 * 1)(GPU_ID)
|
||||
#stm = kio.map_memory_to_gpu(fd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(arr), n_devices=1)
|
||||
|
||||
arr = (ctypes.c_int32 * 1)(GPU_ID)
|
||||
rw_ptr = gpu_alloc(fd, 0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
|
||||
kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED |
|
||||
kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE)
|
||||
stm = kio.map_memory_to_gpu(fd, handle=rw_ptr.handle, device_ids_array_ptr=ctypes.addressof(arr), n_devices=1)
|
||||
assert stm.n_success == 1
|
||||
event_page = gpu_alloc(fd, 0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
|
||||
kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED |
|
||||
kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE)
|
||||
stm = kio.map_memory_to_gpu(fd, handle=event_page.handle, device_ids_array_ptr=ctypes.addressof(arr), n_devices=1)
|
||||
assert stm.n_success == 1
|
||||
ring_base = gpu_alloc_userptr(fd, 0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR | kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
|
||||
kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED |
|
||||
kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE)
|
||||
stm = kio.map_memory_to_gpu(fd, handle=ring_base.handle, device_ids_array_ptr=ctypes.addressof(arr), n_devices=1)
|
||||
assert stm.n_success == 1
|
||||
signals = gpu_alloc_userptr(fd, 0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR | kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
|
||||
kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED |
|
||||
kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE)
|
||||
stm = kio.map_memory_to_gpu(fd, handle=signals.handle, device_ids_array_ptr=ctypes.addressof(arr), n_devices=1)
|
||||
assert stm.n_success == 1
|
||||
eop_buffer = gpu_alloc(fd, 0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM |
|
||||
kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE |
|
||||
kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE)
|
||||
stm = kio.map_memory_to_gpu(fd, handle=eop_buffer.handle, device_ids_array_ptr=ctypes.addressof(arr), n_devices=1)
|
||||
assert stm.n_success == 1
|
||||
ctx_save_restore_address = gpu_alloc(fd, 0x2C02000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM |
|
||||
kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE |
|
||||
kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE)
|
||||
stm = kio.map_memory_to_gpu(fd, handle=ctx_save_restore_address.handle, device_ids_array_ptr=ctypes.addressof(arr), n_devices=1)
|
||||
assert stm.n_success == 1
|
||||
|
||||
#113.00 ms + 0.00 ms : 0 = AMDKFD_IOC_CREATE_QUEUE ring_base_address:0x797465200000 write_pointer_address:0x79751C068038 read_pointer_address:0x79751C068080 doorbell_offset:0x0 ring_size:0x800000 gpu_id:0x433D queue_type:0x2 queue_per
|
||||
#centage:0x64 queue_priority:0x7 queue_id:0x0 eop_buffer_address:0x79751C064000 eop_buffer_size:0x1000 ctx_save_restore_address:0x796E52400000 ctx_save_restore_size:0x2BEA000 ctl_stack_size:0xA000
|
||||
|
||||
#113.84 ms + 0.59 ms : 0 = AMDKFD_IOC_CREATE_QUEUE ring_base_address:0x71AC3F600000 write_pointer_address:0x71B302AB0038 read_pointer_address:0x71B302AB0080 doorbell_offset:0xD0CF400000000008 ring_size:0x800000 gpu_id:0x433D queue_typ
|
||||
#e:0x2 queue_percentage:0x64 queue_priority:0x7 queue_id:0x1 eop_buffer_address:0x71B302AAC000 eop_buffer_size:0x1000 ctx_save_restore_address:0x71AC3C800000 ctx_save_restore_size:0x2BEA000 ctl_stack_size:0xA000
|
||||
|
||||
#define KFD_MMAP_TYPE_SHIFT 62
|
||||
#define KFD_MMAP_TYPE_DOORBELL (0x3ULL << KFD_MMAP_TYPE_SHIFT)
|
||||
evt = kio.create_event(fd, event_page_offset=event_page.handle, auto_reset=1)
|
||||
|
||||
nq = kio.create_queue(fd, ring_base_address=ring_base.va_addr, ring_size=0x1000, gpu_id=GPU_ID,
|
||||
queue_type=kfd.KFD_IOC_QUEUE_TYPE_COMPUTE_AQL, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE,
|
||||
queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
|
||||
eop_buffer_address=eop_buffer.va_addr, eop_buffer_size=0x1000,
|
||||
ctx_save_restore_address=ctx_save_restore_address.va_addr, ctx_save_restore_size=0x2C02000,
|
||||
ctl_stack_size = 0xa000,
|
||||
# write_pointer_address and read_pointer_address are on GART
|
||||
#write_pointer_address=0xaaaabbbb, read_pointer_address=0xaaaacccc)
|
||||
write_pointer_address=rw_ptr.va_addr+0, read_pointer_address=rw_ptr.va_addr+0x8)
|
||||
doorbell = libc.mmap(0, 8192, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, fd, nq.doorbell_offset)
|
||||
print("doorbell", hex(doorbell))
|
||||
|
||||
to_mv(signals.va_addr, 0x40)
|
||||
|
||||
"""
|
||||
hexdump(to_mv(event_page.va_addr, 0x40))
|
||||
kio.set_event(fd, event_id=evt.event_id)
|
||||
hexdump(to_mv(event_page.va_addr, 0x40))
|
||||
kio.reset_event(fd, event_id=evt.event_id)
|
||||
hexdump(to_mv(event_page.va_addr, 0x40))
|
||||
"""
|
||||
|
||||
# KFD_EVENT_TYPE_SIGNAL
|
||||
|
||||
BARRIER_HEADER = 1 << hsa.HSA_PACKET_HEADER_BARRIER
|
||||
BARRIER_HEADER |= hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE
|
||||
BARRIER_HEADER |= hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE
|
||||
BARRIER_HEADER |= hsa.HSA_PACKET_TYPE_BARRIER_AND << hsa.HSA_PACKET_HEADER_TYPE
|
||||
|
||||
AQL_PACKET_SIZE = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t)
|
||||
EMPTY_SIGNAL = hsa.hsa_signal_t()
|
||||
|
||||
ds = to_mv(rw_ptr.va_addr, 0x100).cast("Q")
|
||||
ds[0] = 1 #ring_base.va_addr + AQL_PACKET_SIZE
|
||||
ds[1] = 0 #ring_base.va_addr
|
||||
#libc.memset(rw_ptr.va_addr, 0xaa, 0x100)
|
||||
#hexdump(to_mv(rw_ptr.va_addr, 0x100))
|
||||
|
||||
#packet = hsa.hsa_barrier_and_packet_t.from_address(rw_ptr.va_addr+0x38)
|
||||
packet = hsa.hsa_barrier_and_packet_t.from_address(ring_base.va_addr)
|
||||
packet.reserved0 = 0
|
||||
packet.reserved1 = 0
|
||||
for i in range(5): packet.dep_signal[i] = EMPTY_SIGNAL
|
||||
#packet.dep_signal[0] = hsa.hsa_signal_t(evt.event_id)
|
||||
packet.reserved2 = 0
|
||||
#packet.completion_signal = EMPTY_SIGNAL
|
||||
packet.completion_signal = hsa.hsa_signal_t(signals.va_addr)
|
||||
packet.header = BARRIER_HEADER
|
||||
hexdump(to_mv(ring_base.va_addr, AQL_PACKET_SIZE))
|
||||
|
||||
# _HsaEventData
|
||||
to_mv(signals.va_addr, 0x40).cast("Q")[0] = 1
|
||||
to_mv(signals.va_addr, 0x40).cast("Q")[1] = 1
|
||||
#to_mv(signals.va_addr, 0x40).cast("Q")[2] = event_page
|
||||
to_mv(signals.va_addr, 0x40).cast("Q")[2] = event_page.va_addr + evt.event_slot_index*8 # HWData2=HWAddress
|
||||
to_mv(signals.va_addr, 0x40).cast("Q")[3] = evt.event_trigger_data # HWData3=HWData
|
||||
print(hex(ds[0]), hex(ds[1]), hex(ds[2]))
|
||||
hexdump(to_mv(signals.va_addr, 0x40))
|
||||
|
||||
# 10 08 49 3E 46 77 00 00
|
||||
|
||||
|
||||
# ring doorbell
|
||||
print(hex(to_mv(doorbell, 0x10).cast("I")[0]))
|
||||
#to_mv(doorbell, 0x10).cast("I")[0] = 0xffffffff
|
||||
to_mv(doorbell, 0x10).cast("I")[0] = 0
|
||||
|
||||
evt_arr = (kfd.struct_kfd_event_data * 1)()
|
||||
evt_arr[0].event_id = evt.event_id
|
||||
kio.wait_events(fd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=0, timeout=1000)
|
||||
|
||||
print(hex(ds[0]), hex(ds[1]), hex(ds[2]))
|
||||
hexdump(to_mv(signals.va_addr, 0x40))
|
||||
|
||||
#nq = kio.create_queue(fd, ring_base_address=buf, ring_size=0x1000, gpu_id=GPU_ID,
|
||||
# queue_type=kfd.KFD_IOC_QUEUE_TYPE_COMPUTE_AQL, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE,
|
||||
# queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY, write_pointer_address=buf+8, read_pointer_address=buf+0x10)
|
||||
#print(nq)
|
||||
|
||||
#mv = to_mv(buf, 0x1000)
|
||||
#addr = libc.mmap(0, 0x1000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS, -1, 0)
|
||||
|
||||
#print('\n'.join(format_struct(ver)))
|
||||
#print('\n'.join(format_struct(st)))
|
||||
1430
tinygrad/runtime/autogen/amd_sdma.py
Normal file
1430
tinygrad/runtime/autogen/amd_sdma.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -3588,6 +3588,128 @@ try:
|
||||
hsa_amd_vmem_get_alloc_properties_from_handle.argtypes = [hsa_amd_vmem_alloc_handle_t, ctypes.POINTER(struct_hsa_amd_memory_pool_s), ctypes.POINTER(c__EA_hsa_amd_memory_type_t)]
|
||||
except AttributeError:
|
||||
pass
|
||||
amd_queue_properties32_t = ctypes.c_uint32
|
||||
|
||||
# values for enumeration 'amd_queue_properties_t'
|
||||
amd_queue_properties_t__enumvalues = {
|
||||
0: 'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_SHIFT',
|
||||
1: 'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_WIDTH',
|
||||
1: 'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER',
|
||||
1: 'AMD_QUEUE_PROPERTIES_IS_PTR64_SHIFT',
|
||||
1: 'AMD_QUEUE_PROPERTIES_IS_PTR64_WIDTH',
|
||||
2: 'AMD_QUEUE_PROPERTIES_IS_PTR64',
|
||||
2: 'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS_SHIFT',
|
||||
1: 'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS_WIDTH',
|
||||
4: 'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS',
|
||||
3: 'AMD_QUEUE_PROPERTIES_ENABLE_PROFILING_SHIFT',
|
||||
1: 'AMD_QUEUE_PROPERTIES_ENABLE_PROFILING_WIDTH',
|
||||
8: 'AMD_QUEUE_PROPERTIES_ENABLE_PROFILING',
|
||||
4: 'AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE_SHIFT',
|
||||
1: 'AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE_WIDTH',
|
||||
16: 'AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE',
|
||||
5: 'AMD_QUEUE_PROPERTIES_RESERVED1_SHIFT',
|
||||
27: 'AMD_QUEUE_PROPERTIES_RESERVED1_WIDTH',
|
||||
-32: 'AMD_QUEUE_PROPERTIES_RESERVED1',
|
||||
}
|
||||
AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_SHIFT = 0
|
||||
AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_WIDTH = 1
|
||||
AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER = 1
|
||||
AMD_QUEUE_PROPERTIES_IS_PTR64_SHIFT = 1
|
||||
AMD_QUEUE_PROPERTIES_IS_PTR64_WIDTH = 1
|
||||
AMD_QUEUE_PROPERTIES_IS_PTR64 = 2
|
||||
AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS_SHIFT = 2
|
||||
AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS_WIDTH = 1
|
||||
AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS = 4
|
||||
AMD_QUEUE_PROPERTIES_ENABLE_PROFILING_SHIFT = 3
|
||||
AMD_QUEUE_PROPERTIES_ENABLE_PROFILING_WIDTH = 1
|
||||
AMD_QUEUE_PROPERTIES_ENABLE_PROFILING = 8
|
||||
AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE_SHIFT = 4
|
||||
AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE_WIDTH = 1
|
||||
AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE = 16
|
||||
AMD_QUEUE_PROPERTIES_RESERVED1_SHIFT = 5
|
||||
AMD_QUEUE_PROPERTIES_RESERVED1_WIDTH = 27
|
||||
AMD_QUEUE_PROPERTIES_RESERVED1 = -32
|
||||
amd_queue_properties_t = ctypes.c_int32 # enum
|
||||
class struct_amd_queue_s(Structure):
|
||||
pass
|
||||
|
||||
struct_amd_queue_s._pack_ = 1 # source:False
|
||||
struct_amd_queue_s._fields_ = [
|
||||
('hsa_queue', hsa_queue_t),
|
||||
('reserved1', ctypes.c_uint32 * 4),
|
||||
('write_dispatch_id', ctypes.c_uint64),
|
||||
('group_segment_aperture_base_hi', ctypes.c_uint32),
|
||||
('private_segment_aperture_base_hi', ctypes.c_uint32),
|
||||
('max_cu_id', ctypes.c_uint32),
|
||||
('max_wave_id', ctypes.c_uint32),
|
||||
('max_legacy_doorbell_dispatch_id_plus_1', ctypes.c_uint64),
|
||||
('legacy_doorbell_lock', ctypes.c_uint32),
|
||||
('reserved2', ctypes.c_uint32 * 9),
|
||||
('read_dispatch_id', ctypes.c_uint64),
|
||||
('read_dispatch_id_field_base_byte_offset', ctypes.c_uint32),
|
||||
('compute_tmpring_size', ctypes.c_uint32),
|
||||
('scratch_resource_descriptor', ctypes.c_uint32 * 4),
|
||||
('scratch_backing_memory_location', ctypes.c_uint64),
|
||||
('scratch_backing_memory_byte_size', ctypes.c_uint64),
|
||||
('scratch_wave64_lane_byte_size', ctypes.c_uint32),
|
||||
('queue_properties', ctypes.c_uint32),
|
||||
('reserved3', ctypes.c_uint32 * 2),
|
||||
('queue_inactive_signal', hsa_signal_t),
|
||||
('reserved4', ctypes.c_uint32 * 14),
|
||||
]
|
||||
|
||||
amd_queue_t = struct_amd_queue_s
|
||||
amd_signal_kind64_t = ctypes.c_int64
|
||||
|
||||
# values for enumeration 'amd_signal_kind_t'
|
||||
amd_signal_kind_t__enumvalues = {
|
||||
0: 'AMD_SIGNAL_KIND_INVALID',
|
||||
1: 'AMD_SIGNAL_KIND_USER',
|
||||
-1: 'AMD_SIGNAL_KIND_DOORBELL',
|
||||
-2: 'AMD_SIGNAL_KIND_LEGACY_DOORBELL',
|
||||
}
|
||||
AMD_SIGNAL_KIND_INVALID = 0
|
||||
AMD_SIGNAL_KIND_USER = 1
|
||||
AMD_SIGNAL_KIND_DOORBELL = -1
|
||||
AMD_SIGNAL_KIND_LEGACY_DOORBELL = -2
|
||||
amd_signal_kind_t = ctypes.c_int32 # enum
|
||||
class struct_amd_signal_s(Structure):
|
||||
pass
|
||||
|
||||
class union_amd_signal_s_0(Union):
|
||||
pass
|
||||
|
||||
union_amd_signal_s_0._pack_ = 1 # source:False
|
||||
union_amd_signal_s_0._fields_ = [
|
||||
('value', ctypes.c_int64),
|
||||
('legacy_hardware_doorbell_ptr', ctypes.POINTER(ctypes.c_uint32)),
|
||||
('hardware_doorbell_ptr', ctypes.POINTER(ctypes.c_uint64)),
|
||||
]
|
||||
|
||||
class union_amd_signal_s_1(Union):
|
||||
pass
|
||||
|
||||
union_amd_signal_s_1._pack_ = 1 # source:False
|
||||
union_amd_signal_s_1._fields_ = [
|
||||
('queue_ptr', ctypes.POINTER(struct_amd_queue_s)),
|
||||
('reserved2', ctypes.c_uint64),
|
||||
]
|
||||
|
||||
struct_amd_signal_s._pack_ = 1 # source:False
|
||||
struct_amd_signal_s._anonymous_ = ('_0', '_1',)
|
||||
struct_amd_signal_s._fields_ = [
|
||||
('kind', ctypes.c_int64),
|
||||
('_0', union_amd_signal_s_0),
|
||||
('event_mailbox_ptr', ctypes.c_uint64),
|
||||
('event_id', ctypes.c_uint32),
|
||||
('reserved1', ctypes.c_uint32),
|
||||
('start_ts', ctypes.c_uint64),
|
||||
('end_ts', ctypes.c_uint64),
|
||||
('_1', union_amd_signal_s_1),
|
||||
('reserved3', ctypes.c_uint32 * 2),
|
||||
]
|
||||
|
||||
amd_signal_t = struct_amd_signal_s
|
||||
class struct_BrigModuleHeader(Structure):
|
||||
pass
|
||||
|
||||
@@ -3713,7 +3835,27 @@ struct_hsa_ext_finalizer_1_00_pfn_s._fields_ = [
|
||||
|
||||
hsa_ext_finalizer_1_00_pfn_t = struct_hsa_ext_finalizer_1_00_pfn_s
|
||||
__all__ = \
|
||||
['BrigModule_t', 'HSA_ACCESS_PERMISSION_NONE',
|
||||
['AMD_QUEUE_PROPERTIES_ENABLE_PROFILING',
|
||||
'AMD_QUEUE_PROPERTIES_ENABLE_PROFILING_SHIFT',
|
||||
'AMD_QUEUE_PROPERTIES_ENABLE_PROFILING_WIDTH',
|
||||
'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER',
|
||||
'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS',
|
||||
'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS_SHIFT',
|
||||
'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS_WIDTH',
|
||||
'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_SHIFT',
|
||||
'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_WIDTH',
|
||||
'AMD_QUEUE_PROPERTIES_IS_PTR64',
|
||||
'AMD_QUEUE_PROPERTIES_IS_PTR64_SHIFT',
|
||||
'AMD_QUEUE_PROPERTIES_IS_PTR64_WIDTH',
|
||||
'AMD_QUEUE_PROPERTIES_RESERVED1',
|
||||
'AMD_QUEUE_PROPERTIES_RESERVED1_SHIFT',
|
||||
'AMD_QUEUE_PROPERTIES_RESERVED1_WIDTH',
|
||||
'AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE',
|
||||
'AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE_SHIFT',
|
||||
'AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE_WIDTH',
|
||||
'AMD_SIGNAL_KIND_DOORBELL', 'AMD_SIGNAL_KIND_INVALID',
|
||||
'AMD_SIGNAL_KIND_LEGACY_DOORBELL', 'AMD_SIGNAL_KIND_USER',
|
||||
'BrigModule_t', 'HSA_ACCESS_PERMISSION_NONE',
|
||||
'HSA_ACCESS_PERMISSION_RO', 'HSA_ACCESS_PERMISSION_RW',
|
||||
'HSA_ACCESS_PERMISSION_WO', 'HSA_AGENT_FEATURE_AGENT_DISPATCH',
|
||||
'HSA_AGENT_FEATURE_KERNEL_DISPATCH',
|
||||
@@ -4082,8 +4224,10 @@ __all__ = \
|
||||
'HSA_VARIABLE_SEGMENT_READONLY', 'HSA_WAIT_STATE_ACTIVE',
|
||||
'HSA_WAIT_STATE_BLOCKED', 'HSA_WAVEFRONT_INFO_SIZE',
|
||||
'MEMORY_TYPE_NONE', 'MEMORY_TYPE_PINNED',
|
||||
'c__EA_hsa_access_permission_t', 'c__EA_hsa_agent_feature_t',
|
||||
'c__EA_hsa_agent_info_t',
|
||||
'amd_queue_properties32_t', 'amd_queue_properties_t',
|
||||
'amd_queue_t', 'amd_signal_kind64_t', 'amd_signal_kind_t',
|
||||
'amd_signal_t', 'c__EA_hsa_access_permission_t',
|
||||
'c__EA_hsa_agent_feature_t', 'c__EA_hsa_agent_info_t',
|
||||
'c__EA_hsa_amd_agent_memory_pool_info_t',
|
||||
'c__EA_hsa_amd_copy_direction_t',
|
||||
'c__EA_hsa_amd_hw_exception_reset_cause_t',
|
||||
@@ -4422,6 +4566,7 @@ __all__ = \
|
||||
'hsa_wait_state_t__enumvalues', 'hsa_wavefront_get_info',
|
||||
'hsa_wavefront_info_t', 'hsa_wavefront_info_t__enumvalues',
|
||||
'hsa_wavefront_t', 'int32_t', 'size_t', 'struct_BrigModuleHeader',
|
||||
'struct_amd_queue_s', 'struct_amd_signal_s',
|
||||
'struct_hsa_agent_dispatch_packet_s', 'struct_hsa_agent_s',
|
||||
'struct_hsa_amd_barrier_value_packet_s', 'struct_hsa_amd_event_s',
|
||||
'struct_hsa_amd_gpu_hw_exception_info_s',
|
||||
@@ -4455,4 +4600,5 @@ __all__ = \
|
||||
'struct_hsa_queue_s', 'struct_hsa_region_s',
|
||||
'struct_hsa_signal_group_s', 'struct_hsa_signal_s',
|
||||
'struct_hsa_wavefront_s', 'uint16_t', 'uint32_t', 'uint64_t',
|
||||
'union_amd_signal_s_0', 'union_amd_signal_s_1',
|
||||
'union_hsa_amd_event_s_0']
|
||||
|
||||
812
tinygrad/runtime/autogen/kfd.py
Normal file
812
tinygrad/runtime/autogen/kfd.py
Normal file
@@ -0,0 +1,812 @@
|
||||
# mypy: ignore-errors
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# TARGET arch is: []
|
||||
# WORD_SIZE is: 8
|
||||
# POINTER_SIZE is: 8
|
||||
# LONGDOUBLE_SIZE is: 16
|
||||
#
|
||||
import ctypes, os
|
||||
|
||||
|
||||
class AsDictMixin:
|
||||
@classmethod
|
||||
def as_dict(cls, self):
|
||||
result = {}
|
||||
if not isinstance(self, AsDictMixin):
|
||||
# not a structure, assume it's already a python object
|
||||
return self
|
||||
if not hasattr(cls, "_fields_"):
|
||||
return result
|
||||
# sys.version_info >= (3, 5)
|
||||
# for (field, *_) in cls._fields_: # noqa
|
||||
for field_tuple in cls._fields_: # noqa
|
||||
field = field_tuple[0]
|
||||
if field.startswith('PADDING_'):
|
||||
continue
|
||||
value = getattr(self, field)
|
||||
type_ = type(value)
|
||||
if hasattr(value, "_length_") and hasattr(value, "_type_"):
|
||||
# array
|
||||
if not hasattr(type_, "as_dict"):
|
||||
value = [v for v in value]
|
||||
else:
|
||||
type_ = type_._type_
|
||||
value = [type_.as_dict(v) for v in value]
|
||||
elif hasattr(value, "contents") and hasattr(value, "_type_"):
|
||||
# pointer
|
||||
try:
|
||||
if not hasattr(type_, "as_dict"):
|
||||
value = value.contents
|
||||
else:
|
||||
type_ = type_._type_
|
||||
value = type_.as_dict(value.contents)
|
||||
except ValueError:
|
||||
# nullptr
|
||||
value = None
|
||||
elif isinstance(value, AsDictMixin):
|
||||
# other structure
|
||||
value = type_.as_dict(value)
|
||||
result[field] = value
|
||||
return result
|
||||
|
||||
|
||||
class Structure(ctypes.Structure, AsDictMixin):
|
||||
|
||||
def __init__(self, *args, **kwds):
|
||||
# We don't want to use positional arguments fill PADDING_* fields
|
||||
|
||||
args = dict(zip(self.__class__._field_names_(), args))
|
||||
args.update(kwds)
|
||||
super(Structure, self).__init__(**args)
|
||||
|
||||
@classmethod
|
||||
def _field_names_(cls):
|
||||
if hasattr(cls, '_fields_'):
|
||||
return (f[0] for f in cls._fields_ if not f[0].startswith('PADDING'))
|
||||
else:
|
||||
return ()
|
||||
|
||||
@classmethod
|
||||
def get_type(cls, field):
|
||||
for f in cls._fields_:
|
||||
if f[0] == field:
|
||||
return f[1]
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def bind(cls, bound_fields):
|
||||
fields = {}
|
||||
for name, type_ in cls._fields_:
|
||||
if hasattr(type_, "restype"):
|
||||
if name in bound_fields:
|
||||
if bound_fields[name] is None:
|
||||
fields[name] = type_()
|
||||
else:
|
||||
# use a closure to capture the callback from the loop scope
|
||||
fields[name] = (
|
||||
type_((lambda callback: lambda *args: callback(*args))(
|
||||
bound_fields[name]))
|
||||
)
|
||||
del bound_fields[name]
|
||||
else:
|
||||
# default callback implementation (does nothing)
|
||||
try:
|
||||
default_ = type_(0).restype().value
|
||||
except TypeError:
|
||||
default_ = None
|
||||
fields[name] = type_((
|
||||
lambda default_: lambda *args: default_)(default_))
|
||||
else:
|
||||
# not a callback function, use default initialization
|
||||
if name in bound_fields:
|
||||
fields[name] = bound_fields[name]
|
||||
del bound_fields[name]
|
||||
else:
|
||||
fields[name] = type_()
|
||||
if len(bound_fields) != 0:
|
||||
raise ValueError(
|
||||
"Cannot bind the following unknown callback(s) {}.{}".format(
|
||||
cls.__name__, bound_fields.keys()
|
||||
))
|
||||
return cls(**fields)
|
||||
|
||||
|
||||
class Union(ctypes.Union, AsDictMixin):
|
||||
pass
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
KFD_IOCTL_H_INCLUDED = True # macro
|
||||
KFD_IOCTL_MAJOR_VERSION = 1 # macro
|
||||
KFD_IOCTL_MINOR_VERSION = 6 # macro
|
||||
KFD_IOC_QUEUE_TYPE_COMPUTE = 0x0 # macro
|
||||
KFD_IOC_QUEUE_TYPE_SDMA = 0x1 # macro
|
||||
KFD_IOC_QUEUE_TYPE_COMPUTE_AQL = 0x2 # macro
|
||||
KFD_IOC_QUEUE_TYPE_SDMA_XGMI = 0x3 # macro
|
||||
KFD_MAX_QUEUE_PERCENTAGE = 100 # macro
|
||||
KFD_MAX_QUEUE_PRIORITY = 15 # macro
|
||||
KFD_IOC_CACHE_POLICY_COHERENT = 0 # macro
|
||||
KFD_IOC_CACHE_POLICY_NONCOHERENT = 1 # macro
|
||||
NUM_OF_SUPPORTED_GPUS = 7 # macro
|
||||
MAX_ALLOWED_NUM_POINTS = 100 # macro
|
||||
MAX_ALLOWED_AW_BUFF_SIZE = 4096 # macro
|
||||
MAX_ALLOWED_WAC_BUFF_SIZE = 128 # macro
|
||||
KFD_IOC_EVENT_SIGNAL = 0 # macro
|
||||
KFD_IOC_EVENT_NODECHANGE = 1 # macro
|
||||
KFD_IOC_EVENT_DEVICESTATECHANGE = 2 # macro
|
||||
KFD_IOC_EVENT_HW_EXCEPTION = 3 # macro
|
||||
KFD_IOC_EVENT_SYSTEM_EVENT = 4 # macro
|
||||
KFD_IOC_EVENT_DEBUG_EVENT = 5 # macro
|
||||
KFD_IOC_EVENT_PROFILE_EVENT = 6 # macro
|
||||
KFD_IOC_EVENT_QUEUE_EVENT = 7 # macro
|
||||
KFD_IOC_EVENT_MEMORY = 8 # macro
|
||||
KFD_IOC_WAIT_RESULT_COMPLETE = 0 # macro
|
||||
KFD_IOC_WAIT_RESULT_TIMEOUT = 1 # macro
|
||||
KFD_IOC_WAIT_RESULT_FAIL = 2 # macro
|
||||
KFD_SIGNAL_EVENT_LIMIT = 4096 # macro
|
||||
KFD_HW_EXCEPTION_WHOLE_GPU_RESET = 0 # macro
|
||||
KFD_HW_EXCEPTION_PER_ENGINE_RESET = 1 # macro
|
||||
KFD_HW_EXCEPTION_GPU_HANG = 0 # macro
|
||||
KFD_HW_EXCEPTION_ECC = 1 # macro
|
||||
KFD_MEM_ERR_NO_RAS = 0 # macro
|
||||
KFD_MEM_ERR_SRAM_ECC = 1 # macro
|
||||
KFD_MEM_ERR_POISON_CONSUMED = 2 # macro
|
||||
KFD_MEM_ERR_GPU_HANG = 3 # macro
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_VRAM = (1<<0) # macro
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_GTT = (1<<1) # macro
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_USERPTR = (1<<2) # macro
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL = (1<<3) # macro
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP = (1<<4) # macro
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE = (1<<31) # macro
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE = (1<<30) # macro
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC = (1<<29) # macro
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE = (1<<28) # macro
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM = (1<<27) # macro
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_COHERENT = (1<<26) # macro
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED = (1<<25) # macro
|
||||
# def KFD_SMI_EVENT_MASK_FROM_INDEX(i): # macro
|
||||
# return (1<<((i)-1))
|
||||
KFD_IOCTL_SVM_FLAG_HOST_ACCESS = 0x00000001 # macro
|
||||
KFD_IOCTL_SVM_FLAG_COHERENT = 0x00000002 # macro
|
||||
KFD_IOCTL_SVM_FLAG_HIVE_LOCAL = 0x00000004 # macro
|
||||
KFD_IOCTL_SVM_FLAG_GPU_RO = 0x00000008 # macro
|
||||
KFD_IOCTL_SVM_FLAG_GPU_EXEC = 0x00000010 # macro
|
||||
KFD_IOCTL_SVM_FLAG_GPU_READ_MOSTLY = 0x00000020 # macro
|
||||
AMDKFD_IOCTL_BASE = 'K' # macro
|
||||
# def AMDKFD_IO(nr): # macro
|
||||
# return _IO('K',nr)
|
||||
# def AMDKFD_IOR(nr, type): # macro
|
||||
# return _IOR('K',nr,type)
|
||||
# def AMDKFD_IOW(nr, type): # macro
|
||||
# return _IOW('K',nr,type)
|
||||
# def AMDKFD_IOWR(nr, type): # macro
|
||||
# return _IOWR('K',nr,type)
|
||||
# AMDKFD_IOC_GET_VERSION = _IOR('K',nr,type) ( 0x01 , struct kfd_ioctl_get_version_args ) # macro
|
||||
# AMDKFD_IOC_CREATE_QUEUE = _IOWR('K',nr,type) ( 0x02 , struct kfd_ioctl_create_queue_args ) # macro
|
||||
# AMDKFD_IOC_DESTROY_QUEUE = _IOWR('K',nr,type) ( 0x03 , struct kfd_ioctl_destroy_queue_args ) # macro
|
||||
# AMDKFD_IOC_SET_MEMORY_POLICY = _IOW('K',nr,type) ( 0x04 , struct kfd_ioctl_set_memory_policy_args ) # macro
|
||||
# AMDKFD_IOC_GET_CLOCK_COUNTERS = _IOWR('K',nr,type) ( 0x05 , struct kfd_ioctl_get_clock_counters_args ) # macro
|
||||
# AMDKFD_IOC_GET_PROCESS_APERTURES = _IOR('K',nr,type) ( 0x06 , struct kfd_ioctl_get_process_apertures_args ) # macro
|
||||
# AMDKFD_IOC_UPDATE_QUEUE = _IOW('K',nr,type) ( 0x07 , struct kfd_ioctl_update_queue_args ) # macro
|
||||
# AMDKFD_IOC_CREATE_EVENT = _IOWR('K',nr,type) ( 0x08 , struct kfd_ioctl_create_event_args ) # macro
|
||||
# AMDKFD_IOC_DESTROY_EVENT = _IOW('K',nr,type) ( 0x09 , struct kfd_ioctl_destroy_event_args ) # macro
|
||||
# AMDKFD_IOC_SET_EVENT = _IOW('K',nr,type) ( 0x0A , struct kfd_ioctl_set_event_args ) # macro
|
||||
# AMDKFD_IOC_RESET_EVENT = _IOW('K',nr,type) ( 0x0B , struct kfd_ioctl_reset_event_args ) # macro
|
||||
# AMDKFD_IOC_WAIT_EVENTS = _IOWR('K',nr,type) ( 0x0C , struct kfd_ioctl_wait_events_args ) # macro
|
||||
# AMDKFD_IOC_DBG_REGISTER = _IOW('K',nr,type) ( 0x0D , struct kfd_ioctl_dbg_register_args ) # macro
|
||||
# AMDKFD_IOC_DBG_UNREGISTER = _IOW('K',nr,type) ( 0x0E , struct kfd_ioctl_dbg_unregister_args ) # macro
|
||||
# AMDKFD_IOC_DBG_ADDRESS_WATCH = _IOW('K',nr,type) ( 0x0F , struct kfd_ioctl_dbg_address_watch_args ) # macro
|
||||
# AMDKFD_IOC_DBG_WAVE_CONTROL = _IOW('K',nr,type) ( 0x10 , struct kfd_ioctl_dbg_wave_control_args ) # macro
|
||||
# AMDKFD_IOC_SET_SCRATCH_BACKING_VA = _IOWR('K',nr,type) ( 0x11 , struct kfd_ioctl_set_scratch_backing_va_args ) # macro
|
||||
# AMDKFD_IOC_GET_TILE_CONFIG = _IOWR('K',nr,type) ( 0x12 , struct kfd_ioctl_get_tile_config_args ) # macro
|
||||
# AMDKFD_IOC_SET_TRAP_HANDLER = _IOW('K',nr,type) ( 0x13 , struct kfd_ioctl_set_trap_handler_args ) # macro
|
||||
# AMDKFD_IOC_GET_PROCESS_APERTURES_NEW = _IOWR('K',nr,type) ( 0x14 , struct kfd_ioctl_get_process_apertures_new_args ) # macro
|
||||
# AMDKFD_IOC_ACQUIRE_VM = _IOW('K',nr,type) ( 0x15 , struct kfd_ioctl_acquire_vm_args ) # macro
|
||||
# AMDKFD_IOC_ALLOC_MEMORY_OF_GPU = _IOWR('K',nr,type) ( 0x16 , struct kfd_ioctl_alloc_memory_of_gpu_args ) # macro
|
||||
# AMDKFD_IOC_FREE_MEMORY_OF_GPU = _IOW('K',nr,type) ( 0x17 , struct kfd_ioctl_free_memory_of_gpu_args ) # macro
|
||||
# AMDKFD_IOC_MAP_MEMORY_TO_GPU = _IOWR('K',nr,type) ( 0x18 , struct kfd_ioctl_map_memory_to_gpu_args ) # macro
|
||||
# AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU = _IOWR('K',nr,type) ( 0x19 , struct kfd_ioctl_unmap_memory_from_gpu_args ) # macro
|
||||
# AMDKFD_IOC_SET_CU_MASK = _IOW('K',nr,type) ( 0x1A , struct kfd_ioctl_set_cu_mask_args ) # macro
|
||||
# AMDKFD_IOC_GET_QUEUE_WAVE_STATE = _IOWR('K',nr,type) ( 0x1B , struct kfd_ioctl_get_queue_wave_state_args ) # macro
|
||||
# AMDKFD_IOC_GET_DMABUF_INFO = _IOWR('K',nr,type) ( 0x1C , struct kfd_ioctl_get_dmabuf_info_args ) # macro
|
||||
# AMDKFD_IOC_IMPORT_DMABUF = _IOWR('K',nr,type) ( 0x1D , struct kfd_ioctl_import_dmabuf_args ) # macro
|
||||
# AMDKFD_IOC_ALLOC_QUEUE_GWS = _IOWR('K',nr,type) ( 0x1E , struct kfd_ioctl_alloc_queue_gws_args ) # macro
|
||||
# AMDKFD_IOC_SMI_EVENTS = _IOWR('K',nr,type) ( 0x1F , struct kfd_ioctl_smi_events_args ) # macro
|
||||
# AMDKFD_IOC_SVM = _IOWR('K',nr,type) ( 0x20 , struct kfd_ioctl_svm_args ) # macro
|
||||
# AMDKFD_IOC_SET_XNACK_MODE = _IOWR('K',nr,type) ( 0x21 , struct kfd_ioctl_set_xnack_mode_args ) # macro
|
||||
AMDKFD_COMMAND_START = 0x01 # macro
|
||||
AMDKFD_COMMAND_END = 0x22 # macro
|
||||
class struct_kfd_ioctl_get_version_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_get_version_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_get_version_args._fields_ = [
|
||||
('major_version', ctypes.c_uint32),
|
||||
('minor_version', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_create_queue_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_create_queue_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_create_queue_args._fields_ = [
|
||||
('ring_base_address', ctypes.c_uint64),
|
||||
('write_pointer_address', ctypes.c_uint64),
|
||||
('read_pointer_address', ctypes.c_uint64),
|
||||
('doorbell_offset', ctypes.c_uint64),
|
||||
('ring_size', ctypes.c_uint32),
|
||||
('gpu_id', ctypes.c_uint32),
|
||||
('queue_type', ctypes.c_uint32),
|
||||
('queue_percentage', ctypes.c_uint32),
|
||||
('queue_priority', ctypes.c_uint32),
|
||||
('queue_id', ctypes.c_uint32),
|
||||
('eop_buffer_address', ctypes.c_uint64),
|
||||
('eop_buffer_size', ctypes.c_uint64),
|
||||
('ctx_save_restore_address', ctypes.c_uint64),
|
||||
('ctx_save_restore_size', ctypes.c_uint32),
|
||||
('ctl_stack_size', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_destroy_queue_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_destroy_queue_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_destroy_queue_args._fields_ = [
|
||||
('queue_id', ctypes.c_uint32),
|
||||
('pad', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_update_queue_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_update_queue_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_update_queue_args._fields_ = [
|
||||
('ring_base_address', ctypes.c_uint64),
|
||||
('queue_id', ctypes.c_uint32),
|
||||
('ring_size', ctypes.c_uint32),
|
||||
('queue_percentage', ctypes.c_uint32),
|
||||
('queue_priority', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_set_cu_mask_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_set_cu_mask_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_set_cu_mask_args._fields_ = [
|
||||
('queue_id', ctypes.c_uint32),
|
||||
('num_cu_mask', ctypes.c_uint32),
|
||||
('cu_mask_ptr', ctypes.c_uint64),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_get_queue_wave_state_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_get_queue_wave_state_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_get_queue_wave_state_args._fields_ = [
|
||||
('ctl_stack_address', ctypes.c_uint64),
|
||||
('ctl_stack_used_size', ctypes.c_uint32),
|
||||
('save_area_used_size', ctypes.c_uint32),
|
||||
('queue_id', ctypes.c_uint32),
|
||||
('pad', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_set_memory_policy_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_set_memory_policy_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_set_memory_policy_args._fields_ = [
|
||||
('alternate_aperture_base', ctypes.c_uint64),
|
||||
('alternate_aperture_size', ctypes.c_uint64),
|
||||
('gpu_id', ctypes.c_uint32),
|
||||
('default_policy', ctypes.c_uint32),
|
||||
('alternate_policy', ctypes.c_uint32),
|
||||
('pad', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_get_clock_counters_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_get_clock_counters_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_get_clock_counters_args._fields_ = [
|
||||
('gpu_clock_counter', ctypes.c_uint64),
|
||||
('cpu_clock_counter', ctypes.c_uint64),
|
||||
('system_clock_counter', ctypes.c_uint64),
|
||||
('system_clock_freq', ctypes.c_uint64),
|
||||
('gpu_id', ctypes.c_uint32),
|
||||
('pad', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_process_device_apertures(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_process_device_apertures._pack_ = 1 # source:False
|
||||
struct_kfd_process_device_apertures._fields_ = [
|
||||
('lds_base', ctypes.c_uint64),
|
||||
('lds_limit', ctypes.c_uint64),
|
||||
('scratch_base', ctypes.c_uint64),
|
||||
('scratch_limit', ctypes.c_uint64),
|
||||
('gpuvm_base', ctypes.c_uint64),
|
||||
('gpuvm_limit', ctypes.c_uint64),
|
||||
('gpu_id', ctypes.c_uint32),
|
||||
('pad', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_get_process_apertures_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_get_process_apertures_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_get_process_apertures_args._fields_ = [
|
||||
('process_apertures', struct_kfd_process_device_apertures * 7),
|
||||
('num_of_nodes', ctypes.c_uint32),
|
||||
('pad', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_get_process_apertures_new_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_get_process_apertures_new_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_get_process_apertures_new_args._fields_ = [
|
||||
('kfd_process_device_apertures_ptr', ctypes.c_uint64),
|
||||
('num_of_nodes', ctypes.c_uint32),
|
||||
('pad', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_dbg_register_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_dbg_register_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_dbg_register_args._fields_ = [
|
||||
('gpu_id', ctypes.c_uint32),
|
||||
('pad', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_dbg_unregister_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_dbg_unregister_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_dbg_unregister_args._fields_ = [
|
||||
('gpu_id', ctypes.c_uint32),
|
||||
('pad', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_dbg_address_watch_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_dbg_address_watch_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_dbg_address_watch_args._fields_ = [
|
||||
('content_ptr', ctypes.c_uint64),
|
||||
('gpu_id', ctypes.c_uint32),
|
||||
('buf_size_in_bytes', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_dbg_wave_control_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_dbg_wave_control_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_dbg_wave_control_args._fields_ = [
|
||||
('content_ptr', ctypes.c_uint64),
|
||||
('gpu_id', ctypes.c_uint32),
|
||||
('buf_size_in_bytes', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_create_event_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_create_event_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_create_event_args._fields_ = [
|
||||
('event_page_offset', ctypes.c_uint64),
|
||||
('event_trigger_data', ctypes.c_uint32),
|
||||
('event_type', ctypes.c_uint32),
|
||||
('auto_reset', ctypes.c_uint32),
|
||||
('node_id', ctypes.c_uint32),
|
||||
('event_id', ctypes.c_uint32),
|
||||
('event_slot_index', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_destroy_event_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_destroy_event_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_destroy_event_args._fields_ = [
|
||||
('event_id', ctypes.c_uint32),
|
||||
('pad', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_set_event_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_set_event_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_set_event_args._fields_ = [
|
||||
('event_id', ctypes.c_uint32),
|
||||
('pad', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_reset_event_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_reset_event_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_reset_event_args._fields_ = [
|
||||
('event_id', ctypes.c_uint32),
|
||||
('pad', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_memory_exception_failure(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_memory_exception_failure._pack_ = 1 # source:False
|
||||
struct_kfd_memory_exception_failure._fields_ = [
|
||||
('NotPresent', ctypes.c_uint32),
|
||||
('ReadOnly', ctypes.c_uint32),
|
||||
('NoExecute', ctypes.c_uint32),
|
||||
('imprecise', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_hsa_memory_exception_data(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_hsa_memory_exception_data._pack_ = 1 # source:False
|
||||
struct_kfd_hsa_memory_exception_data._fields_ = [
|
||||
('failure', struct_kfd_memory_exception_failure),
|
||||
('va', ctypes.c_uint64),
|
||||
('gpu_id', ctypes.c_uint32),
|
||||
('ErrorType', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_hsa_hw_exception_data(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_hsa_hw_exception_data._pack_ = 1 # source:False
|
||||
struct_kfd_hsa_hw_exception_data._fields_ = [
|
||||
('reset_type', ctypes.c_uint32),
|
||||
('reset_cause', ctypes.c_uint32),
|
||||
('memory_lost', ctypes.c_uint32),
|
||||
('gpu_id', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_event_data(Structure):
|
||||
pass
|
||||
|
||||
class union_kfd_event_data_0(Union):
|
||||
pass
|
||||
|
||||
union_kfd_event_data_0._pack_ = 1 # source:False
|
||||
union_kfd_event_data_0._fields_ = [
|
||||
('memory_exception_data', struct_kfd_hsa_memory_exception_data),
|
||||
('hw_exception_data', struct_kfd_hsa_hw_exception_data),
|
||||
('PADDING_0', ctypes.c_ubyte * 16),
|
||||
]
|
||||
|
||||
struct_kfd_event_data._pack_ = 1 # source:False
|
||||
struct_kfd_event_data._anonymous_ = ('_0',)
|
||||
struct_kfd_event_data._fields_ = [
|
||||
('_0', union_kfd_event_data_0),
|
||||
('kfd_event_data_ext', ctypes.c_uint64),
|
||||
('event_id', ctypes.c_uint32),
|
||||
('pad', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_wait_events_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_wait_events_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_wait_events_args._fields_ = [
|
||||
('events_ptr', ctypes.c_uint64),
|
||||
('num_events', ctypes.c_uint32),
|
||||
('wait_for_all', ctypes.c_uint32),
|
||||
('timeout', ctypes.c_uint32),
|
||||
('wait_result', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_set_scratch_backing_va_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_set_scratch_backing_va_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_set_scratch_backing_va_args._fields_ = [
|
||||
('va_addr', ctypes.c_uint64),
|
||||
('gpu_id', ctypes.c_uint32),
|
||||
('pad', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_get_tile_config_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_get_tile_config_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_get_tile_config_args._fields_ = [
|
||||
('tile_config_ptr', ctypes.c_uint64),
|
||||
('macro_tile_config_ptr', ctypes.c_uint64),
|
||||
('num_tile_configs', ctypes.c_uint32),
|
||||
('num_macro_tile_configs', ctypes.c_uint32),
|
||||
('gpu_id', ctypes.c_uint32),
|
||||
('gb_addr_config', ctypes.c_uint32),
|
||||
('num_banks', ctypes.c_uint32),
|
||||
('num_ranks', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_set_trap_handler_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_set_trap_handler_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_set_trap_handler_args._fields_ = [
|
||||
('tba_addr', ctypes.c_uint64),
|
||||
('tma_addr', ctypes.c_uint64),
|
||||
('gpu_id', ctypes.c_uint32),
|
||||
('pad', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_acquire_vm_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_acquire_vm_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_acquire_vm_args._fields_ = [
|
||||
('drm_fd', ctypes.c_uint32),
|
||||
('gpu_id', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_alloc_memory_of_gpu_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_alloc_memory_of_gpu_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_alloc_memory_of_gpu_args._fields_ = [
|
||||
('va_addr', ctypes.c_uint64),
|
||||
('size', ctypes.c_uint64),
|
||||
('handle', ctypes.c_uint64),
|
||||
('mmap_offset', ctypes.c_uint64),
|
||||
('gpu_id', ctypes.c_uint32),
|
||||
('flags', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_free_memory_of_gpu_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_free_memory_of_gpu_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_free_memory_of_gpu_args._fields_ = [
|
||||
('handle', ctypes.c_uint64),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_map_memory_to_gpu_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_map_memory_to_gpu_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_map_memory_to_gpu_args._fields_ = [
|
||||
('handle', ctypes.c_uint64),
|
||||
('device_ids_array_ptr', ctypes.c_uint64),
|
||||
('n_devices', ctypes.c_uint32),
|
||||
('n_success', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_unmap_memory_from_gpu_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_unmap_memory_from_gpu_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_unmap_memory_from_gpu_args._fields_ = [
|
||||
('handle', ctypes.c_uint64),
|
||||
('device_ids_array_ptr', ctypes.c_uint64),
|
||||
('n_devices', ctypes.c_uint32),
|
||||
('n_success', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_alloc_queue_gws_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_alloc_queue_gws_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_alloc_queue_gws_args._fields_ = [
|
||||
('queue_id', ctypes.c_uint32),
|
||||
('num_gws', ctypes.c_uint32),
|
||||
('first_gws', ctypes.c_uint32),
|
||||
('pad', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_get_dmabuf_info_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_get_dmabuf_info_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_get_dmabuf_info_args._fields_ = [
|
||||
('size', ctypes.c_uint64),
|
||||
('metadata_ptr', ctypes.c_uint64),
|
||||
('metadata_size', ctypes.c_uint32),
|
||||
('gpu_id', ctypes.c_uint32),
|
||||
('flags', ctypes.c_uint32),
|
||||
('dmabuf_fd', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_import_dmabuf_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_import_dmabuf_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_import_dmabuf_args._fields_ = [
|
||||
('va_addr', ctypes.c_uint64),
|
||||
('handle', ctypes.c_uint64),
|
||||
('gpu_id', ctypes.c_uint32),
|
||||
('dmabuf_fd', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
|
||||
# values for enumeration 'kfd_smi_event'
|
||||
kfd_smi_event__enumvalues = {
|
||||
0: 'KFD_SMI_EVENT_NONE',
|
||||
1: 'KFD_SMI_EVENT_VMFAULT',
|
||||
2: 'KFD_SMI_EVENT_THERMAL_THROTTLE',
|
||||
3: 'KFD_SMI_EVENT_GPU_PRE_RESET',
|
||||
4: 'KFD_SMI_EVENT_GPU_POST_RESET',
|
||||
}
|
||||
KFD_SMI_EVENT_NONE = 0
|
||||
KFD_SMI_EVENT_VMFAULT = 1
|
||||
KFD_SMI_EVENT_THERMAL_THROTTLE = 2
|
||||
KFD_SMI_EVENT_GPU_PRE_RESET = 3
|
||||
KFD_SMI_EVENT_GPU_POST_RESET = 4
|
||||
kfd_smi_event = ctypes.c_uint32 # enum
|
||||
class struct_kfd_ioctl_smi_events_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_smi_events_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_smi_events_args._fields_ = [
|
||||
('gpuid', ctypes.c_uint32),
|
||||
('anon_fd', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
|
||||
# values for enumeration 'kfd_mmio_remap'
|
||||
kfd_mmio_remap__enumvalues = {
|
||||
0: 'KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL',
|
||||
4: 'KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL',
|
||||
}
|
||||
KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL = 0
|
||||
KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL = 4
|
||||
kfd_mmio_remap = ctypes.c_uint32 # enum
|
||||
|
||||
# values for enumeration 'kfd_ioctl_svm_op'
|
||||
kfd_ioctl_svm_op__enumvalues = {
|
||||
0: 'KFD_IOCTL_SVM_OP_SET_ATTR',
|
||||
1: 'KFD_IOCTL_SVM_OP_GET_ATTR',
|
||||
}
|
||||
KFD_IOCTL_SVM_OP_SET_ATTR = 0
|
||||
KFD_IOCTL_SVM_OP_GET_ATTR = 1
|
||||
kfd_ioctl_svm_op = ctypes.c_uint32 # enum
|
||||
|
||||
# values for enumeration 'kfd_ioctl_svm_location'
|
||||
kfd_ioctl_svm_location__enumvalues = {
|
||||
0: 'KFD_IOCTL_SVM_LOCATION_SYSMEM',
|
||||
4294967295: 'KFD_IOCTL_SVM_LOCATION_UNDEFINED',
|
||||
}
|
||||
KFD_IOCTL_SVM_LOCATION_SYSMEM = 0
|
||||
KFD_IOCTL_SVM_LOCATION_UNDEFINED = 4294967295
|
||||
kfd_ioctl_svm_location = ctypes.c_uint32 # enum
|
||||
|
||||
# values for enumeration 'kfd_ioctl_svm_attr_type'
|
||||
kfd_ioctl_svm_attr_type__enumvalues = {
|
||||
0: 'KFD_IOCTL_SVM_ATTR_PREFERRED_LOC',
|
||||
1: 'KFD_IOCTL_SVM_ATTR_PREFETCH_LOC',
|
||||
2: 'KFD_IOCTL_SVM_ATTR_ACCESS',
|
||||
3: 'KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE',
|
||||
4: 'KFD_IOCTL_SVM_ATTR_NO_ACCESS',
|
||||
5: 'KFD_IOCTL_SVM_ATTR_SET_FLAGS',
|
||||
6: 'KFD_IOCTL_SVM_ATTR_CLR_FLAGS',
|
||||
7: 'KFD_IOCTL_SVM_ATTR_GRANULARITY',
|
||||
}
|
||||
KFD_IOCTL_SVM_ATTR_PREFERRED_LOC = 0
|
||||
KFD_IOCTL_SVM_ATTR_PREFETCH_LOC = 1
|
||||
KFD_IOCTL_SVM_ATTR_ACCESS = 2
|
||||
KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE = 3
|
||||
KFD_IOCTL_SVM_ATTR_NO_ACCESS = 4
|
||||
KFD_IOCTL_SVM_ATTR_SET_FLAGS = 5
|
||||
KFD_IOCTL_SVM_ATTR_CLR_FLAGS = 6
|
||||
KFD_IOCTL_SVM_ATTR_GRANULARITY = 7
|
||||
kfd_ioctl_svm_attr_type = ctypes.c_uint32 # enum
|
||||
class struct_kfd_ioctl_svm_attribute(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_svm_attribute._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_svm_attribute._fields_ = [
|
||||
('type', ctypes.c_uint32),
|
||||
('value', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_svm_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_svm_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_svm_args._fields_ = [
|
||||
('start_addr', ctypes.c_uint64),
|
||||
('size', ctypes.c_uint64),
|
||||
('op', ctypes.c_uint32),
|
||||
('nattr', ctypes.c_uint32),
|
||||
('attrs', struct_kfd_ioctl_svm_attribute * 0),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_set_xnack_mode_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_set_xnack_mode_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_set_xnack_mode_args._fields_ = [
|
||||
('xnack_enabled', ctypes.c_int32),
|
||||
]
|
||||
|
||||
__all__ = \
|
||||
['AMDKFD_COMMAND_END', 'AMDKFD_COMMAND_START',
|
||||
'AMDKFD_IOCTL_BASE', 'KFD_HW_EXCEPTION_ECC',
|
||||
'KFD_HW_EXCEPTION_GPU_HANG', 'KFD_HW_EXCEPTION_PER_ENGINE_RESET',
|
||||
'KFD_HW_EXCEPTION_WHOLE_GPU_RESET', 'KFD_IOCTL_H_INCLUDED',
|
||||
'KFD_IOCTL_MAJOR_VERSION', 'KFD_IOCTL_MINOR_VERSION',
|
||||
'KFD_IOCTL_SVM_ATTR_ACCESS', 'KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE',
|
||||
'KFD_IOCTL_SVM_ATTR_CLR_FLAGS', 'KFD_IOCTL_SVM_ATTR_GRANULARITY',
|
||||
'KFD_IOCTL_SVM_ATTR_NO_ACCESS',
|
||||
'KFD_IOCTL_SVM_ATTR_PREFERRED_LOC',
|
||||
'KFD_IOCTL_SVM_ATTR_PREFETCH_LOC', 'KFD_IOCTL_SVM_ATTR_SET_FLAGS',
|
||||
'KFD_IOCTL_SVM_FLAG_COHERENT', 'KFD_IOCTL_SVM_FLAG_GPU_EXEC',
|
||||
'KFD_IOCTL_SVM_FLAG_GPU_READ_MOSTLY', 'KFD_IOCTL_SVM_FLAG_GPU_RO',
|
||||
'KFD_IOCTL_SVM_FLAG_HIVE_LOCAL', 'KFD_IOCTL_SVM_FLAG_HOST_ACCESS',
|
||||
'KFD_IOCTL_SVM_LOCATION_SYSMEM',
|
||||
'KFD_IOCTL_SVM_LOCATION_UNDEFINED', 'KFD_IOCTL_SVM_OP_GET_ATTR',
|
||||
'KFD_IOCTL_SVM_OP_SET_ATTR',
|
||||
'KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM',
|
||||
'KFD_IOC_ALLOC_MEM_FLAGS_COHERENT',
|
||||
'KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL',
|
||||
'KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE',
|
||||
'KFD_IOC_ALLOC_MEM_FLAGS_GTT',
|
||||
'KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP',
|
||||
'KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE',
|
||||
'KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC',
|
||||
'KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED',
|
||||
'KFD_IOC_ALLOC_MEM_FLAGS_USERPTR', 'KFD_IOC_ALLOC_MEM_FLAGS_VRAM',
|
||||
'KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE',
|
||||
'KFD_IOC_CACHE_POLICY_COHERENT',
|
||||
'KFD_IOC_CACHE_POLICY_NONCOHERENT', 'KFD_IOC_EVENT_DEBUG_EVENT',
|
||||
'KFD_IOC_EVENT_DEVICESTATECHANGE', 'KFD_IOC_EVENT_HW_EXCEPTION',
|
||||
'KFD_IOC_EVENT_MEMORY', 'KFD_IOC_EVENT_NODECHANGE',
|
||||
'KFD_IOC_EVENT_PROFILE_EVENT', 'KFD_IOC_EVENT_QUEUE_EVENT',
|
||||
'KFD_IOC_EVENT_SIGNAL', 'KFD_IOC_EVENT_SYSTEM_EVENT',
|
||||
'KFD_IOC_QUEUE_TYPE_COMPUTE', 'KFD_IOC_QUEUE_TYPE_COMPUTE_AQL',
|
||||
'KFD_IOC_QUEUE_TYPE_SDMA', 'KFD_IOC_QUEUE_TYPE_SDMA_XGMI',
|
||||
'KFD_IOC_WAIT_RESULT_COMPLETE', 'KFD_IOC_WAIT_RESULT_FAIL',
|
||||
'KFD_IOC_WAIT_RESULT_TIMEOUT', 'KFD_MAX_QUEUE_PERCENTAGE',
|
||||
'KFD_MAX_QUEUE_PRIORITY', 'KFD_MEM_ERR_GPU_HANG',
|
||||
'KFD_MEM_ERR_NO_RAS', 'KFD_MEM_ERR_POISON_CONSUMED',
|
||||
'KFD_MEM_ERR_SRAM_ECC', 'KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL',
|
||||
'KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL', 'KFD_SIGNAL_EVENT_LIMIT',
|
||||
'KFD_SMI_EVENT_GPU_POST_RESET', 'KFD_SMI_EVENT_GPU_PRE_RESET',
|
||||
'KFD_SMI_EVENT_NONE', 'KFD_SMI_EVENT_THERMAL_THROTTLE',
|
||||
'KFD_SMI_EVENT_VMFAULT', 'MAX_ALLOWED_AW_BUFF_SIZE',
|
||||
'MAX_ALLOWED_NUM_POINTS', 'MAX_ALLOWED_WAC_BUFF_SIZE',
|
||||
'NUM_OF_SUPPORTED_GPUS', 'kfd_ioctl_svm_attr_type',
|
||||
'kfd_ioctl_svm_location', 'kfd_ioctl_svm_op', 'kfd_mmio_remap',
|
||||
'kfd_smi_event', 'struct_kfd_event_data',
|
||||
'struct_kfd_hsa_hw_exception_data',
|
||||
'struct_kfd_hsa_memory_exception_data',
|
||||
'struct_kfd_ioctl_acquire_vm_args',
|
||||
'struct_kfd_ioctl_alloc_memory_of_gpu_args',
|
||||
'struct_kfd_ioctl_alloc_queue_gws_args',
|
||||
'struct_kfd_ioctl_create_event_args',
|
||||
'struct_kfd_ioctl_create_queue_args',
|
||||
'struct_kfd_ioctl_dbg_address_watch_args',
|
||||
'struct_kfd_ioctl_dbg_register_args',
|
||||
'struct_kfd_ioctl_dbg_unregister_args',
|
||||
'struct_kfd_ioctl_dbg_wave_control_args',
|
||||
'struct_kfd_ioctl_destroy_event_args',
|
||||
'struct_kfd_ioctl_destroy_queue_args',
|
||||
'struct_kfd_ioctl_free_memory_of_gpu_args',
|
||||
'struct_kfd_ioctl_get_clock_counters_args',
|
||||
'struct_kfd_ioctl_get_dmabuf_info_args',
|
||||
'struct_kfd_ioctl_get_process_apertures_args',
|
||||
'struct_kfd_ioctl_get_process_apertures_new_args',
|
||||
'struct_kfd_ioctl_get_queue_wave_state_args',
|
||||
'struct_kfd_ioctl_get_tile_config_args',
|
||||
'struct_kfd_ioctl_get_version_args',
|
||||
'struct_kfd_ioctl_import_dmabuf_args',
|
||||
'struct_kfd_ioctl_map_memory_to_gpu_args',
|
||||
'struct_kfd_ioctl_reset_event_args',
|
||||
'struct_kfd_ioctl_set_cu_mask_args',
|
||||
'struct_kfd_ioctl_set_event_args',
|
||||
'struct_kfd_ioctl_set_memory_policy_args',
|
||||
'struct_kfd_ioctl_set_scratch_backing_va_args',
|
||||
'struct_kfd_ioctl_set_trap_handler_args',
|
||||
'struct_kfd_ioctl_set_xnack_mode_args',
|
||||
'struct_kfd_ioctl_smi_events_args', 'struct_kfd_ioctl_svm_args',
|
||||
'struct_kfd_ioctl_svm_attribute',
|
||||
'struct_kfd_ioctl_unmap_memory_from_gpu_args',
|
||||
'struct_kfd_ioctl_update_queue_args',
|
||||
'struct_kfd_ioctl_wait_events_args',
|
||||
'struct_kfd_memory_exception_failure',
|
||||
'struct_kfd_process_device_apertures', 'union_kfd_event_data_0']
|
||||
333
tinygrad/runtime/ops_kfd.py
Normal file
333
tinygrad/runtime/ops_kfd.py
Normal file
@@ -0,0 +1,333 @@
|
||||
from __future__ import annotations
|
||||
from typing import Tuple
|
||||
import os, fcntl, ctypes, functools, re, pathlib, mmap, struct
|
||||
from tinygrad.device import Compiled, LRUAllocator, Compiler, BufferOptions, CompilerOptions
|
||||
from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up
|
||||
from tinygrad.renderer.cstyle import HIPRenderer
|
||||
from tinygrad.runtime.driver.hip_comgr import compile_hip
|
||||
import tinygrad.runtime.autogen.kfd as kfd
|
||||
import tinygrad.runtime.autogen.hsa as hsa
|
||||
import tinygrad.runtime.autogen.amd_sdma as amd_sdma
|
||||
if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401
|
||||
|
||||
libc = ctypes.CDLL("libc.so.6")
|
||||
libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
|
||||
libc.mmap.restype = ctypes.c_void_p
|
||||
|
||||
def node_sysfs_path(node_id, file): return f"/sys/devices/virtual/kfd/kfd/topology/nodes/{node_id}/{file}"
|
||||
|
||||
def kfd_ioctl(idir, nr, user_struct, fd, made_struct=None, **kwargs):
|
||||
made = made_struct or user_struct(**kwargs)
|
||||
ret = fcntl.ioctl(fd, (idir<<30) | (ctypes.sizeof(made)<<16) | (ord('K')<<8) | nr, made)
|
||||
if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
|
||||
return made
|
||||
|
||||
def ioctls_from_header():
|
||||
#hdr = pathlib.Path("/usr/include/linux/kfd_ioctl.h").read_text().replace("\\\n", "")
|
||||
#pattern = r'#define\s+(AMDKFD_IOC_[A-Z0-9_]+)\s+AMDKFD_(IOW?R?)\((0x[0-9a-fA-F]+),\s+struct\s([A-Za-z0-9_]+)\)'
|
||||
#matches = re.findall(pattern, hdr, re.MULTILINE)
|
||||
# get this from python instead
|
||||
hdrpy = (pathlib.Path(__file__).parent / "autogen" / "kfd.py").read_text()
|
||||
pattern = r'# (AMDKFD_IOC_[A-Z0-9_]+)\s=\s_(IOW?R?).*\(( 0x[0-9a-fA-F]+) ,\s+struct\s([A-Za-z0-9_]+)\s+\)'
|
||||
matches = re.findall(pattern, hdrpy, re.MULTILINE)
|
||||
idirs = {"IOW": 1, "IOR": 2, "IOWR": 3}
|
||||
fxns = {name.replace("AMDKFD_IOC_", "").lower():
|
||||
functools.partial(kfd_ioctl, idirs[idir], int(nr, 0x10), getattr(kfd, "struct_"+sname))
|
||||
for name, idir, nr, sname in matches}
|
||||
return type("KIO", (object, ), fxns)
|
||||
kio = ioctls_from_header()
|
||||
|
||||
def create_sdma_packets():
|
||||
# TODO: clean up this, if we want to keep it
|
||||
structs = {}
|
||||
for name,pkt in [(name,s) for name,s in amd_sdma.__dict__.items() if name.startswith("struct_SDMA_PKT_") and name.endswith("_TAG")]:
|
||||
names = set()
|
||||
fields = []
|
||||
for pkt_fields in pkt._fields_:
|
||||
if not pkt_fields[0].endswith("_UNION"): fields.append(pkt_fields)
|
||||
else:
|
||||
assert pkt_fields[1]._fields_[0][0] == '_0'
|
||||
for union_fields in pkt_fields[1]._fields_[0][1]._fields_:
|
||||
fname = union_fields[0]
|
||||
if fname in names: fname = pkt_fields[0]+fname
|
||||
names.add(fname)
|
||||
if fname.endswith("_63_32") and fields[-1][0].endswith("_31_0"):
|
||||
fields[-1] = tuple([fname[:-6], ctypes.c_ulong, 64]) # merge together 64-bit fields
|
||||
else:
|
||||
fields.append(tuple([fname, *union_fields[1:]]))
|
||||
new_name = name[16:-4].lower()
|
||||
structs[new_name] = init_c_struct_t(tuple(fields))
|
||||
assert ctypes.sizeof(structs[new_name]) == ctypes.sizeof(pkt), f"{ctypes.sizeof(structs[new_name])} != {ctypes.sizeof(pkt)}"
|
||||
return type("SDMA_PKTS", (object, ), structs)
|
||||
sdma_pkts = create_sdma_packets()
|
||||
|
||||
class KFDCompiler(Compiler):
|
||||
compiler_opts = CompilerOptions("KFD", has_tensor_cores=True, shared_max=65536)
|
||||
def __init__(self, arch:str):
|
||||
self.arch = arch
|
||||
super().__init__(f"compile_hip_{self.arch}")
|
||||
def render(self, name:str, uops) -> str: return HIPRenderer(name, uops)
|
||||
def compile(self, src:str) -> bytes: return compile_hip(src, self.arch)
|
||||
|
||||
AQL_PACKET_SIZE = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t)
|
||||
SDMA_MAX_COPY_SIZE = 0x400000
|
||||
|
||||
DISPATCH_KERNEL_SETUP = 3 << hsa.HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS
|
||||
DISPATCH_KERNEL_HEADER = 1 << hsa.HSA_PACKET_HEADER_BARRIER
|
||||
DISPATCH_KERNEL_HEADER |= hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE
|
||||
DISPATCH_KERNEL_HEADER |= hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE
|
||||
DISPATCH_KERNEL_HEADER |= hsa.HSA_PACKET_TYPE_KERNEL_DISPATCH << hsa.HSA_PACKET_HEADER_TYPE
|
||||
|
||||
SHT_PROGBITS = 0x1
|
||||
SHF_ALLOC = 0x2
|
||||
|
||||
class KFDProgram:
|
||||
def __init__(self, device:KFDDevice, name:str, lib:bytes):
|
||||
# TODO; this API needs the type signature of the function and global_size/local_size
|
||||
self.device, self.name, self.lib = device, name, lib
|
||||
|
||||
_phoff, _shoff, _flags, _ehsize, _phentsize, _phnum, _shentsize, _shnum, _shstrndx = struct.unpack_from("<QQIHHHHHH", self.lib, 0x20)
|
||||
sections = [struct.unpack_from("<IIQQQQIIQ", self.lib, _shoff + i * _shentsize) for i in range(_shnum)]
|
||||
|
||||
lib_gpu_size = round_up(max(sh[5]+sh[3] for sh in sections if sh[1] == SHT_PROGBITS), 0x1000)
|
||||
self.lib_gpu = self.device._gpu_alloc(lib_gpu_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=True)
|
||||
lib_gpu_view = to_mv(self.lib_gpu.va_addr, lib_gpu_size)
|
||||
|
||||
for _, sh_type, sh_flags, sh_addr, sh_offset, sh_size, _, _, _ in sections:
|
||||
if sh_type == SHT_PROGBITS and sh_flags & SHF_ALLOC: lib_gpu_view[sh_addr:sh_addr+sh_size] = self.lib[sh_offset:sh_offset+sh_size]
|
||||
|
||||
entry_point = min(sh[3] for sh in sections if sh[1] == SHT_PROGBITS and sh[2] & SHF_ALLOC)
|
||||
self.handle = self.lib_gpu.va_addr + entry_point
|
||||
self.group_segment_size = lib_gpu_view.cast("I")[entry_point//4]
|
||||
self.private_segment_size = lib_gpu_view.cast("I")[entry_point//4 + 1]
|
||||
self.kernargs_segment_size = lib_gpu_view.cast("I")[entry_point//4 + 2]
|
||||
assert self.private_segment_size <= self.device.max_private_segment_size, \
|
||||
f"{self.private_segment_size=} > {self.device.max_private_segment_size=}"
|
||||
|
||||
# NOTE: no programs are ever freed
|
||||
def __del__(self): kio.free_memory_of_gpu(KFDDevice.kfd, handle=self.lib_gpu.handle)
|
||||
|
||||
def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
|
||||
if not hasattr(self, "args_struct_t"):
|
||||
self.args_struct_t = init_c_struct_t(tuple([(f'f{i}', ctypes.c_void_p) for i in range(len(args))] +
|
||||
[(f'v{i}', ctypes.c_int) for i in range(len(vals))]))
|
||||
if ctypes.sizeof(self.args_struct_t) != self.kernargs_segment_size:
|
||||
raise RuntimeError(f"HSAProgram.__call__: incorrect args struct size {ctypes.sizeof(self.args_struct_t)} != {self.kernargs_segment_size}")
|
||||
args_st = self.args_struct_t.from_address(self.device.kernargs.va_addr)
|
||||
for i in range(len(args)): args_st.__setattr__(f'f{i}', args[i].va_addr)
|
||||
for i in range(len(vals)): args_st.__setattr__(f'v{i}', vals[i])
|
||||
|
||||
self.device.completion_signal.value = 1 # reset the signal before call
|
||||
packet = hsa.hsa_kernel_dispatch_packet_t.from_address(self.device.aql_ring.va_addr +
|
||||
(self.device.aql_doorbell_value*AQL_PACKET_SIZE) % self.device.aql_ring.size)
|
||||
packet.workgroup_size_x, packet.workgroup_size_y, packet.workgroup_size_z = local_size
|
||||
packet.reserved0 = 0
|
||||
packet.grid_size_x, packet.grid_size_y, packet.grid_size_z = tuple(g*l for g,l in zip(global_size, local_size))
|
||||
packet.kernel_object = self.handle
|
||||
packet.kernarg_address = self.device.kernargs.va_addr
|
||||
packet.group_segment_size = self.group_segment_size
|
||||
packet.private_segment_size = self.private_segment_size # what it this and why doesn't it work? (see TestOps.test_dilated_conv_transpose2d)
|
||||
packet.reserved2 = 0
|
||||
packet.completion_signal = hsa.hsa_signal_t(ctypes.addressof(self.device.completion_signal))
|
||||
packet.setup = DISPATCH_KERNEL_SETUP
|
||||
packet.header = DISPATCH_KERNEL_HEADER
|
||||
|
||||
# one pending packet + ring doorbell
|
||||
self.device.amd_aql_queue.write_dispatch_id = self.device.aql_doorbell_value + 1
|
||||
self.device.aql_doorbell[0] = self.device.aql_doorbell_value
|
||||
self.device.aql_doorbell_value += 1
|
||||
|
||||
evt_arr = (kfd.struct_kfd_event_data * 1)()
|
||||
evt_arr[0].event_id = self.device.completion_signal.event_id
|
||||
kio.wait_events(KFDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=1000)
|
||||
|
||||
assert (wp:=self.device.amd_aql_queue.write_dispatch_id) == (rp:=self.device.amd_aql_queue.read_dispatch_id), f"didn't run {wp} != {rp}"
|
||||
if wait: return (self.device.completion_signal.end_ts-self.device.completion_signal.start_ts)/1e9
|
||||
|
||||
class KFDAllocator(LRUAllocator):
|
||||
def __init__(self, device:KFDDevice):
|
||||
self.device = device
|
||||
super().__init__()
|
||||
|
||||
def _alloc(self, size:int, options:BufferOptions):
|
||||
if options.host: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True)
|
||||
else: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=True)
|
||||
|
||||
def copyin(self, dest, src: memoryview):
|
||||
# TODO: need to make the address visible to gpu and pass it directly to sdma.
|
||||
self.device._map_userptr_to_gpu(ctypes.addressof(from_mv(src).contents), src.nbytes)
|
||||
self.device.completion_signal.value = 1
|
||||
self.device._submit_sdma(dest.va_addr, ctypes.addressof(from_mv(src).contents), src.nbytes, completion_signal=self.device.completion_signal)
|
||||
evt_arr = (kfd.struct_kfd_event_data * 1)()
|
||||
evt_arr[0].event_id = self.device.completion_signal.event_id
|
||||
kio.wait_events(KFDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=1000)
|
||||
|
||||
def copyout(self, dest:memoryview, src):
|
||||
self.device._map_userptr_to_gpu(ctypes.addressof(from_mv(dest).contents), dest.nbytes)
|
||||
self.device.completion_signal.value = 1
|
||||
self.device._submit_sdma(ctypes.addressof(from_mv(dest).contents), src.va_addr, dest.nbytes, completion_signal=self.device.completion_signal)
|
||||
evt_arr = (kfd.struct_kfd_event_data * 1)()
|
||||
evt_arr[0].event_id = self.device.completion_signal.event_id
|
||||
kio.wait_events(KFDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=1000)
|
||||
|
||||
MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
|
||||
class KFDDevice(Compiled):
|
||||
kfd:int = -1
|
||||
|
||||
def _map_userptr_to_gpu(self, addr, size):
|
||||
self.map_uptr2gpu_struct.start_addr = addr&~0xfff
|
||||
self.map_uptr2gpu_struct.size = round_up(size+addr-(addr&~0xfff), 0x1000)
|
||||
kio.svm(self.kfd, made_struct=self.map_uptr2gpu_struct)
|
||||
|
||||
def _gpu_alloc(self, size:int, flags:int, uncached=False, public=False, map_to_gpu=True):
|
||||
flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
|
||||
if uncached: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED
|
||||
if public: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC
|
||||
if flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR:
|
||||
buf = addr = libc.mmap(0, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
|
||||
else:
|
||||
buf, addr = 0, libc.mmap(0, size, 0, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS|MAP_NORESERVE, -1, 0)
|
||||
assert addr != 0xffffffffffffffff
|
||||
mem = kio.alloc_memory_of_gpu(self.kfd, va_addr=addr, size=size, gpu_id=self.gpu_id, flags=flags, mmap_offset=buf)
|
||||
if not (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR):
|
||||
buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|MAP_FIXED, self.drm_fd, mem.mmap_offset)
|
||||
assert buf != 0xffffffffffffffff
|
||||
assert addr == buf == mem.va_addr
|
||||
if map_to_gpu:
|
||||
arr = (ctypes.c_int32 * 1)(self.gpu_id)
|
||||
stm = kio.map_memory_to_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(arr), n_devices=1)
|
||||
assert stm.n_success == 1
|
||||
return mem
|
||||
|
||||
def __init__(self, device:str=""):
|
||||
if KFDDevice.kfd == -1: KFDDevice.kfd = os.open("/dev/kfd", os.O_RDWR)
|
||||
self.device_id = int(device.split(":")[1]) if ":" in device else 0
|
||||
with open(node_sysfs_path(self.device_id+1, "gpu_id"), "r") as f: self.gpu_id = int(f.read())
|
||||
with open(node_sysfs_path(self.device_id+1, "properties"), "r") as f: self.properties = {line.split()[0]: int(line.split()[1]) for line in f}
|
||||
self.drm_fd = os.open(f"/dev/dri/renderD{self.properties['drm_render_minor']}", os.O_RDWR)
|
||||
self.arch = f"gfx{self.properties['gfx_target_version']//100}"
|
||||
kio.acquire_vm(KFDDevice.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id)
|
||||
|
||||
self.event_page = self._gpu_alloc(0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
self.sync_event = kio.create_event(KFDDevice.kfd, event_page_offset=self.event_page.handle, auto_reset=1)
|
||||
self.eop_buffer = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
|
||||
self.aql_ring = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
|
||||
self.signals_page = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
|
||||
self.gart = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
self.kernargs = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
|
||||
self.ctx_save_restore_address = self._gpu_alloc(0x2C02000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
|
||||
|
||||
self.completion_signal = hsa.amd_signal_t.from_address(self.signals_page.va_addr)
|
||||
self.completion_signal.value = 1
|
||||
self.completion_signal.kind = hsa.AMD_SIGNAL_KIND_USER
|
||||
self.completion_signal.event_mailbox_ptr = self.event_page.va_addr + self.sync_event.event_slot_index*8
|
||||
self.completion_signal.event_id = self.sync_event.event_id
|
||||
|
||||
# AQL Queue
|
||||
self.amd_aql_queue = hsa.amd_queue_t.from_address(self.gart.va_addr)
|
||||
self.amd_aql_queue.write_dispatch_id = 0
|
||||
self.amd_aql_queue.read_dispatch_id = 0
|
||||
self.amd_aql_queue.read_dispatch_id_field_base_byte_offset = getattr(hsa.amd_queue_t, 'read_dispatch_id').offset
|
||||
self.amd_aql_queue.queue_properties = hsa.AMD_QUEUE_PROPERTIES_IS_PTR64 | hsa.AMD_QUEUE_PROPERTIES_ENABLE_PROFILING
|
||||
|
||||
self.amd_aql_queue.max_cu_id = self.properties['simd_count'] // self.properties['simd_per_cu'] - 1
|
||||
self.amd_aql_queue.max_wave_id = self.properties['max_waves_per_simd'] * self.properties['simd_per_cu'] - 1
|
||||
|
||||
# scratch setup
|
||||
self.max_private_segment_size = 256
|
||||
self.scratch_len = self.max_private_segment_size * (self.amd_aql_queue.max_cu_id + 1) * (self.amd_aql_queue.max_wave_id + 1)
|
||||
self.scratch = self._gpu_alloc(self.scratch_len, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
|
||||
self.amd_aql_queue.scratch_backing_memory_location = self.scratch.va_addr
|
||||
self.amd_aql_queue.scratch_backing_memory_byte_size = self.scratch_len
|
||||
self.amd_aql_queue.scratch_wave64_lane_byte_size = self.max_private_segment_size * (self.amd_aql_queue.max_wave_id + 1) // 64
|
||||
self.amd_aql_queue.scratch_resource_descriptor[0] = self.scratch.va_addr & 0xFFFFFFFF
|
||||
self.amd_aql_queue.scratch_resource_descriptor[1] = ((self.scratch.va_addr >> 32) & 0xFFFF) | (1 << 30) # va_hi | SWIZZLE_ENABLE
|
||||
self.amd_aql_queue.scratch_resource_descriptor[2] = self.scratch_len & 0xFFFFFFFF
|
||||
self.amd_aql_queue.scratch_resource_descriptor[3] = 0x20814fac # FORMAT=BUF_FORMAT_32_UINT,OOB_SELECT=2,ADD_TID_ENABLE=1,TYPE=SQ_RSRC_BUF,SQ_SELs
|
||||
|
||||
wave_scratch = (((self.amd_aql_queue.max_wave_id + 1) * self.max_private_segment_size + 255) // 256)
|
||||
self.amd_aql_queue.compute_tmpring_size = wave_scratch << 12 | (self.amd_aql_queue.max_cu_id + 1)
|
||||
|
||||
self.aql_queue = kio.create_queue(KFDDevice.kfd, ring_base_address=self.aql_ring.va_addr, ring_size=self.aql_ring.size, gpu_id=self.gpu_id,
|
||||
queue_type=kfd.KFD_IOC_QUEUE_TYPE_COMPUTE_AQL, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
|
||||
eop_buffer_address=self.eop_buffer.va_addr, eop_buffer_size=self.eop_buffer.size,
|
||||
ctx_save_restore_address=self.ctx_save_restore_address.va_addr, ctx_save_restore_size=self.ctx_save_restore_address.size,
|
||||
ctl_stack_size = 0xa000,
|
||||
write_pointer_address=self.gart.va_addr + getattr(hsa.amd_queue_t, 'write_dispatch_id').offset,
|
||||
read_pointer_address=self.gart.va_addr + getattr(hsa.amd_queue_t, 'read_dispatch_id').offset)
|
||||
|
||||
self.doorbells_base = self.aql_queue.doorbell_offset & (~0xfff)
|
||||
self.doorbells = libc.mmap(0, 8192, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, KFDDevice.kfd, self.doorbells_base)
|
||||
self.aql_doorbell = to_mv(self.doorbells + self.aql_queue.doorbell_offset - self.doorbells_base, 4).cast("I")
|
||||
self.aql_doorbell_value = 0
|
||||
|
||||
# SDMA Queue
|
||||
self.sdma_ring = self._gpu_alloc(1 << 20, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
|
||||
self.sdma_queue = kio.create_queue(KFDDevice.kfd, ring_base_address=self.sdma_ring.va_addr, ring_size=self.sdma_ring.size, gpu_id=self.gpu_id,
|
||||
queue_type=kfd.KFD_IOC_QUEUE_TYPE_SDMA, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
|
||||
write_pointer_address=self.gart.va_addr + 0x100, read_pointer_address=self.gart.va_addr + 0x108)
|
||||
|
||||
self.sdma_read_pointer = to_mv(self.sdma_queue.read_pointer_address, 8).cast("Q")
|
||||
self.sdma_write_pointer = to_mv(self.sdma_queue.write_pointer_address, 8).cast("Q")
|
||||
self.sdma_doorbell = to_mv(self.doorbells + self.sdma_queue.doorbell_offset - self.doorbells_base, 4).cast("I")
|
||||
self.sdma_doorbell_value = 0
|
||||
|
||||
# prebuilt packets
|
||||
self.sdma_flush_hdp_pkt = sdma_pkts.hdp_flush(0x8, 0x0, 0x80000000, 0x0, 0x0, 0x0)
|
||||
self.sdma_cache_inv = sdma_pkts.gcr(op=amd_sdma.SDMA_OP_GCR, sub_op=amd_sdma.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1,
|
||||
GCR_CONTROL_GL2_INV=1, GCR_CONTROL_GL1_INV=1, GCR_CONTROL_GLV_INV=1, GCR_CONTROL_GLK_INV=1,
|
||||
GCR_CONTROL_GL2_RANGE=0)
|
||||
self.sdma_cache_wb = sdma_pkts.gcr(op=amd_sdma.SDMA_OP_GCR, sub_op=amd_sdma.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1,
|
||||
GCR_CONTROL_GL2_RANGE=0)
|
||||
|
||||
# Helpers
|
||||
map_uptr2gpu_struct_t = init_c_struct_t(tuple(kfd.struct_kfd_ioctl_svm_args._fields_[:-1]+[('attrs', kfd.struct_kfd_ioctl_svm_attribute*2)])) # type: ignore
|
||||
self.map_uptr2gpu_struct = map_uptr2gpu_struct_t(nattr=2, op=0x0)
|
||||
self.map_uptr2gpu_struct.attrs[0].type = kfd.KFD_IOCTL_SVM_ATTR_SET_FLAGS
|
||||
self.map_uptr2gpu_struct.attrs[0].value = kfd.KFD_IOCTL_SVM_FLAG_COHERENT
|
||||
self.map_uptr2gpu_struct.attrs[1].type = kfd.KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE
|
||||
self.map_uptr2gpu_struct.attrs[1].value = self.gpu_id
|
||||
|
||||
super().__init__(device, KFDAllocator(self), KFDCompiler(self.arch), functools.partial(KFDProgram, self))
|
||||
|
||||
def _submit_sdma(self, dest, src, copy_size, wait_signals=None, completion_signal=None):
|
||||
def blit_sdma_command(cmd):
|
||||
ctypes.memmove(self.sdma_ring.va_addr + (self.sdma_doorbell_value % self.sdma_ring.size), ctypes.addressof(cmd), sz:=ctypes.sizeof(cmd))
|
||||
self.sdma_doorbell_value += sz
|
||||
|
||||
if wait_signals is not None:
|
||||
# NOTE: we check only low 32 bits to be zeroed, we don't use higher values for signals
|
||||
for sig in wait_signals:
|
||||
poll_addr = ctypes.addressof(sig) + getattr(hsa.amd_signal_t, 'value').offset
|
||||
blit_sdma_command(sdma_pkts.poll_regmem(op=amd_sdma.SDMA_OP_POLL_REGMEM, mem_poll=1, func=0x3, addr=poll_addr,
|
||||
value=0, mask=0xffffffff, interval=0x04, retry_count=0xfff))
|
||||
|
||||
if completion_signal is not None:
|
||||
blit_sdma_command(sdma_pkts.timestamp(op=amd_sdma.SDMA_OP_TIMESTAMP, sub_op=amd_sdma.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL,
|
||||
addr=ctypes.addressof(completion_signal) + getattr(hsa.amd_signal_t, 'start_ts').offset))
|
||||
blit_sdma_command(self.sdma_flush_hdp_pkt)
|
||||
blit_sdma_command(self.sdma_cache_inv)
|
||||
|
||||
copied = 0
|
||||
copies_commands = (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
|
||||
for _ in range(copies_commands):
|
||||
step_copy_size = min(copy_size - copied, SDMA_MAX_COPY_SIZE)
|
||||
blit_sdma_command(sdma_pkts.copy_linear(op=amd_sdma.SDMA_OP_COPY, sub_op=amd_sdma.SDMA_SUBOP_COPY_LINEAR,
|
||||
count=step_copy_size-1, src_addr=src+copied, dst_addr=dest+copied))
|
||||
copied += step_copy_size
|
||||
|
||||
blit_sdma_command(self.sdma_cache_wb)
|
||||
if completion_signal is not None:
|
||||
blit_sdma_command(sdma_pkts.timestamp(op=amd_sdma.SDMA_OP_TIMESTAMP, sub_op=amd_sdma.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL,
|
||||
addr=ctypes.addressof(completion_signal) + getattr(hsa.amd_signal_t, 'end_ts').offset))
|
||||
|
||||
if completion_signal is not None:
|
||||
signal_addr = ctypes.addressof(completion_signal) + getattr(hsa.amd_signal_t, 'value').offset
|
||||
blit_sdma_command(sdma_pkts.atomic(op=amd_sdma.SDMA_OP_ATOMIC, operation=amd_sdma.SDMA_ATOMIC_ADD64, addr=signal_addr, src_data=(1<<64)-1))
|
||||
if completion_signal.event_mailbox_ptr != 0:
|
||||
blit_sdma_command(sdma_pkts.fence(op=amd_sdma.SDMA_OP_FENCE, mtype=3, addr=completion_signal.event_mailbox_ptr,
|
||||
data=completion_signal.event_id))
|
||||
blit_sdma_command(sdma_pkts.trap(op=amd_sdma.SDMA_OP_TRAP, int_ctx=completion_signal.event_id))
|
||||
|
||||
self.sdma_write_pointer[0] = self.sdma_doorbell_value
|
||||
self.sdma_doorbell[0] = self.sdma_doorbell_value
|
||||
Reference in New Issue
Block a user