SQTT profiling (#9278)

* sqtt

* docs

* multi-device

* ProfileSQTTEvent

* exec update

* 256mb default

* don't let people hang their gpus

* bitfields from autogen

* asic info from mesa

* more bitfields from autogen

* SQTT_ITRACE_SE_MASK

---------

Co-authored-by: George Hotz <72895+geohot@users.noreply.github.com>
This commit is contained in:
uuuvn
2025-03-11 10:19:56 +05:00
committed by GitHub
parent 2780e2027e
commit e85001b6ee
9 changed files with 3164 additions and 9 deletions

View File

@@ -118,12 +118,15 @@ jobs:
cp tinygrad/runtime/autogen/hsa.py /tmp/hsa.py.bak
cp tinygrad/runtime/autogen/comgr.py /tmp/comgr.py.bak
cp tinygrad/runtime/autogen/amd_gpu.py /tmp/amd_gpu.py.bak
cp tinygrad/runtime/autogen/sqtt.py /tmp/sqtt.py.bak
./autogen_stubs.sh hsa
./autogen_stubs.sh comgr
./autogen_stubs.sh amd
./autogen_stubs.sh sqtt
diff /tmp/hsa.py.bak tinygrad/runtime/autogen/hsa.py
diff /tmp/comgr.py.bak tinygrad/runtime/autogen/comgr.py
diff /tmp/amd_gpu.py.bak tinygrad/runtime/autogen/amd_gpu.py
diff /tmp/sqtt.py.bak tinygrad/runtime/autogen/sqtt.py
- name: Verify Linux autogen
run: |
cp tinygrad/runtime/autogen/libc.py /tmp/libc.py.bak

View File

@@ -362,6 +362,16 @@ generate_am() {
fixup $BASE/am/hdp_6_0_0.py
}
# Regenerate the SQTT/RGP ctypes bindings ($BASE/sqtt.py) from extra/sqtt/sqtt.h.
generate_sqtt() {
clang2py -k cdefstum \
extra/sqtt/sqtt.h \
-o $BASE/sqtt.py
fixup $BASE/sqtt.py
# Rewrite the generated "import ctypes" line so that it also imports os.
# NOTE(review): the s command uses a backslash as its delimiter; POSIX excludes backslash as a
# delimiter, so this presumably relies on GNU sed behaviour - confirm before running elsewhere.
sed -i "s\import ctypes\import ctypes, os\g" $BASE/sqtt.py
# Import the freshly generated module once so that a broken binding fails here.
python3 -c "import tinygrad.runtime.autogen.sqtt"
}
generate_webgpu() {
clang2py -l /usr/local/lib/libwebgpu_dawn.so extra/webgpu/webgpu.h -o $BASE/webgpu.py
fixup $BASE/webgpu.py
@@ -380,6 +390,7 @@ elif [ "$1" == "kfd" ]; then generate_kfd
elif [ "$1" == "nv" ]; then generate_nv
elif [ "$1" == "amd" ]; then generate_amd
elif [ "$1" == "am" ]; then generate_am
elif [ "$1" == "sqtt" ]; then generate_sqtt
elif [ "$1" == "qcom" ]; then generate_qcom
elif [ "$1" == "io_uring" ]; then generate_io_uring
elif [ "$1" == "libc" ]; then generate_libc

33
extra/sqtt/README.md Normal file
View File

@@ -0,0 +1,33 @@
# SQTT Profiling
## Getting SQ Thread Trace
Only supported on 7900XTX, requires either AM (`rmmod amdgpu`) or disabling power gating on AMD (`ppfeaturemask=0xffff3fff`, don't forget to rebuild initramfs)
SQTT is implemented on top of normal tinygrad PROFILE=1, `PROFILE=1 SQTT=1` to get profile pickle with sqtt data embedded in it.
`SQTT_BUFFER_SIZE=X` to change size of SQTT buffer (per shader engine, 6 SEs on 7900xtx) in megabytes, default 256.
`SQTT_ITRACE_SE_MASK=X` to select for which shader engines instruction tracing will be enabled, -1 is all, 0 is none (instruction tracing disabled), >0 is
bitfield/mask for SEs to enable instruction tracing on. Masking shader engines will give smaller file sizes at a cost of fewer hits, and kernels that
don't have any wavefront on the first SIMD of a shader engine with instruction tracing enabled will not have instruction timings.
The default is 2 (second shader engine only), only one for file size reasons, second instead of first because dispatch starts from it so there is
greater chance that kernels with small global size will have instruction tracing data.
Note that instruction tracing might not be available for kernels with small global dims, this is not a bug, but it can be improved with various hacks
to the point where it can reliably trace a kernel consisting of a single wavefront (am only, not quite reliable under amdgpu due to waves sometimes
being dispatched starting from different simds). More info in comments in ops_amd.py
## Converting pickled profile with SQTT data into RGP file
```bash
extra/sqtt/rgptool.py create "/tmp/profile.pkl.$USER" -o /tmp/gpu0.rgp
```
Then load gpu0.rgp into Radeon GPU Profiler. It works just fine both in wine (macos, native version available for linux) and via ssh X forwarding.
If multiple GPUs are used, you can select which one to export with `-d` like this:
```bash
extra/sqtt/rgptool.py create "/tmp/profile.pkl.$USER" -d 'AMD:5' -o /tmp/gpu5.rgp
```

330
extra/sqtt/rgptool.py Executable file
View File

@@ -0,0 +1,330 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse, ctypes, struct, hashlib, pickle, code, typing, functools
import tinygrad.runtime.autogen.sqtt as sqtt
from tinygrad.device import ProfileEvent, ProfileDeviceEvent, ProfileProgramEvent
from tinygrad.runtime.ops_amd import ProfileSQTTEvent
from tinygrad.helpers import round_up, flatten, all_same
from dataclasses import dataclass
# Maps a chunk type id to the ctypes struct describing that chunk's header.
# RGP.from_bytes looks up the header-only chunk types (ASIC_INFO, CPU_INFO, API_INFO, SQTT_DESC) here;
# RESERVED and INSTRUMENTATION_TABLE have no entry.
CHUNK_CLASSES = {
sqtt.SQTT_FILE_CHUNK_TYPE_ASIC_INFO: sqtt.struct_sqtt_file_chunk_asic_info,
sqtt.SQTT_FILE_CHUNK_TYPE_SQTT_DESC: sqtt.struct_sqtt_file_chunk_sqtt_desc,
sqtt.SQTT_FILE_CHUNK_TYPE_SQTT_DATA: sqtt.struct_sqtt_file_chunk_sqtt_data,
sqtt.SQTT_FILE_CHUNK_TYPE_API_INFO: sqtt.struct_sqtt_file_chunk_api_info,
sqtt.SQTT_FILE_CHUNK_TYPE_QUEUE_EVENT_TIMINGS: sqtt.struct_sqtt_file_chunk_queue_event_timings,
sqtt.SQTT_FILE_CHUNK_TYPE_CLOCK_CALIBRATION: sqtt.struct_sqtt_file_chunk_clock_calibration,
sqtt.SQTT_FILE_CHUNK_TYPE_CPU_INFO: sqtt.struct_sqtt_file_chunk_cpu_info,
sqtt.SQTT_FILE_CHUNK_TYPE_SPM_DB: sqtt.struct_sqtt_file_chunk_spm_db,
sqtt.SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_DATABASE: sqtt.struct_sqtt_file_chunk_code_object_database,
sqtt.SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_LOADER_EVENTS: sqtt.struct_sqtt_file_chunk_code_object_loader_events,
sqtt.SQTT_FILE_CHUNK_TYPE_PSO_CORRELATION: sqtt.struct_sqtt_file_chunk_pso_correlation,
}
def pretty(val, pad=0) -> str:
  """Render a ctypes value (or any plain Python value) as readable text.

  Structures and unions become `Name(` + one `field=value` entry per line + `)`, with nested
  structures indented two more spaces than their parent (`pad` is the parent's indent).
  ctypes arrays become a bracketed, comma separated list. Integers of 1024 and above are
  shown in hex; everything else falls back to `repr`.
  """
  if isinstance(val, (ctypes.Structure, ctypes.Union)):
    inner = ' ' * (pad + 2)
    # _fields_ entries are (name, type) or (name, type, bits); only the name is needed
    rendered = [f'{name}={pretty(getattr(val, name), pad=pad+2)}' for name, *_ in val._fields_]
    return type(val).__name__ + '(\n' + inner + (', \n' + inner).join(rendered) + '\n' + ' ' * pad + ')'
  if isinstance(val, ctypes.Array):
    return '[' + ', '.join(pretty(item) for item in val) + ']'
  if isinstance(val, int) and val >= 1024:
    return hex(val)
  return repr(val)
@dataclass(frozen=True)
class RGPChunk:
header: sqtt.Structure
data: list[typing.Any]|list[tuple[typing.Any, bytes]]|bytes|None = None
def print(self):
print(pretty(self.header))
# if isinstance(self.data, bytes): print(repr(self.data))
if isinstance(self.data, list):
for dchunk in self.data:
if isinstance(dchunk, tuple):
print(pretty(dchunk[0]))
# print(repr(dchunk[1]))
else:
print(pretty(dchunk))
# TODO: `def fixup` and true immutability
def to_bytes(self, offset:int) -> bytes:
cid = self.header.header.chunk_id.type
match cid:
case _ if cid in {sqtt.SQTT_FILE_CHUNK_TYPE_ASIC_INFO, sqtt.SQTT_FILE_CHUNK_TYPE_CPU_INFO, sqtt.SQTT_FILE_CHUNK_TYPE_API_INFO, sqtt.SQTT_FILE_CHUNK_TYPE_SQTT_DESC}:
self.header.header.size_in_bytes = ctypes.sizeof(self.header)
return bytes(self.header)
case sqtt.SQTT_FILE_CHUNK_TYPE_SQTT_DATA:
assert isinstance(self.data, bytes)
self.header.header.size_in_bytes = ctypes.sizeof(self.header) + len(self.data)
self.header.offset = offset+ctypes.sizeof(self.header)
self.header.size = len(self.data)
return bytes(self.header) + self.data
case sqtt.SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_DATABASE:
assert isinstance(self.data, list)
data_codb = typing.cast(list[tuple[sqtt.struct_sqtt_code_object_database_record, bytes]], self.data)
ret = bytearray()
sz = ctypes.sizeof(self.header)+sum([ctypes.sizeof(record_hdr)+round_up(len(record_blob), 4) for record_hdr,record_blob in data_codb])
self.header.header.size_in_bytes = sz
self.header.offset = offset
self.header.record_count = len(data_codb)
self.header.size = sz
ret += self.header
for record_hdr,record_blob in data_codb:
record_hdr.size = round_up(len(record_blob), 4)
ret += record_hdr
ret += record_blob.ljust(4, b'\x00')
return ret
case sqtt.SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_LOADER_EVENTS:
assert isinstance(self.data, list)
data_lev = typing.cast(list[tuple[sqtt.struct_sqtt_code_object_loader_events_record]], self.data)
self.header.header.size_in_bytes = ctypes.sizeof(self.header)+ctypes.sizeof(sqtt.struct_sqtt_code_object_loader_events_record)*len(data_lev)
self.header.offset = offset
self.header.record_size = ctypes.sizeof(sqtt.struct_sqtt_code_object_loader_events_record)
self.header.record_count = len(data_lev)
return bytes(self.header) + b''.join(map(bytes, data_lev))
case sqtt.SQTT_FILE_CHUNK_TYPE_PSO_CORRELATION:
assert isinstance(self.data, list)
data_pso = typing.cast(list[tuple[sqtt.struct_sqtt_pso_correlation_record]], self.data)
self.header.header.size_in_bytes = ctypes.sizeof(self.header)+ctypes.sizeof(sqtt.struct_sqtt_pso_correlation_record)*len(data_pso)
self.header.offset = offset
self.header.record_size = ctypes.sizeof(sqtt.struct_sqtt_pso_correlation_record)
self.header.record_count = len(data_pso)
return bytes(self.header) + b''.join(map(bytes, data_pso))
case _: raise NotImplementedError(pretty(self.header))
@dataclass(frozen=True)
class RGP:
header: sqtt.struct_sqtt_file_header
chunks: list[RGPChunk]
@staticmethod
def from_bytes(blob: bytes) -> RGP:
file_header = sqtt.struct_sqtt_file_header.from_buffer_copy(blob)
assert file_header.magic_number == sqtt.SQTT_FILE_MAGIC_NUMBER and file_header.version_major == sqtt.SQTT_FILE_VERSION_MAJOR
i = file_header.chunk_offset
chunks = []
while i < len(blob):
assert i%4==0, hex(i)
hdr = sqtt.struct_sqtt_file_chunk_header.from_buffer_copy(blob, i)
cid = hdr.chunk_id.type
header: ctypes.Structure
match cid:
case _ if cid in {sqtt.SQTT_FILE_CHUNK_TYPE_RESERVED, sqtt.SQTT_FILE_CHUNK_TYPE_QUEUE_EVENT_TIMINGS, sqtt.SQTT_FILE_CHUNK_TYPE_CLOCK_CALIBRATION, sqtt.SQTT_FILE_CHUNK_TYPE_SPM_DB}:
chunk = None
case sqtt.SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_DATABASE:
header = sqtt.struct_sqtt_file_chunk_code_object_database.from_buffer_copy(blob, i)
j = header.offset + ctypes.sizeof(header)
data: list = []
while j < header.offset + header.size:
rec_hdr: ctypes.Structure = sqtt.struct_sqtt_code_object_database_record.from_buffer_copy(blob, j)
data.append((rec_hdr, elf:=blob[j+ctypes.sizeof(rec_hdr):j+ctypes.sizeof(rec_hdr)+rec_hdr.size]))
assert elf[:4] == b'\x7fELF', repr(elf[:16])
j += ctypes.sizeof(rec_hdr)+rec_hdr.size
assert len(data) == header.record_count
chunk = RGPChunk(header, data)
case sqtt.SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_LOADER_EVENTS:
header = sqtt.struct_sqtt_file_chunk_code_object_loader_events.from_buffer_copy(blob, i)
data = [sqtt.struct_sqtt_code_object_loader_events_record.from_buffer_copy(blob, header.offset+ctypes.sizeof(header)+j*header.record_size)
for j in range(header.record_count)]
chunk = RGPChunk(header, data)
case sqtt.SQTT_FILE_CHUNK_TYPE_PSO_CORRELATION:
header = sqtt.struct_sqtt_file_chunk_pso_correlation.from_buffer_copy(blob, i)
data = [sqtt.struct_sqtt_pso_correlation_record.from_buffer_copy(blob, header.offset+ctypes.sizeof(header)+j*header.record_size)
for j in range(header.record_count)]
chunk = RGPChunk(header, data)
case sqtt.SQTT_FILE_CHUNK_TYPE_SQTT_DATA:
header = sqtt.struct_sqtt_file_chunk_sqtt_data.from_buffer_copy(blob, i)
chunk = RGPChunk(header, blob[header.offset:header.offset+header.size])
case _ if cid in {sqtt.SQTT_FILE_CHUNK_TYPE_ASIC_INFO, sqtt.SQTT_FILE_CHUNK_TYPE_CPU_INFO, sqtt.SQTT_FILE_CHUNK_TYPE_API_INFO,
sqtt.SQTT_FILE_CHUNK_TYPE_SQTT_DESC}:
chunk = RGPChunk(CHUNK_CLASSES[cid].from_buffer_copy(blob, i))
case _:
chunk = None
print(f"unknown chunk id {cid}")
if chunk is not None: chunks.append(chunk)
i += hdr.size_in_bytes
assert i == len(blob), f'{i} != {len(blob)}'
return RGP(file_header, chunks)
@staticmethod
def from_profile(profile_pickled, device:str|None=None):
profile: list[ProfileEvent] = pickle.loads(profile_pickled)
device_events = {x.device:x for x in profile if isinstance(x, ProfileDeviceEvent) and x.device.startswith('AMD')}
if device is None:
if len(device_events) == 0: raise RuntimeError('No supported devices found in profile')
if len(device_events) > 1: raise RuntimeError(f"More than one supported device found, select which one to export: {', '.join(device_events.keys())}")
_, device_event = device_events.popitem()
else:
if device not in device_events: raise RuntimeError(f"Device {device} not found in profile, devices in profile: {', '.join(device_events.keys())} ")
device_event = device_events[device]
sqtt_events = [x for x in profile if isinstance(x, ProfileSQTTEvent) and x.device == device_event.device]
if len(sqtt_events) == 0: raise RuntimeError(f"Device {device_event.device} doesn't contain SQTT data")
sqtt_itrace_enabled = any([event.itrace for event in sqtt_events])
sqtt_itrace_masked = not all_same([event.itrace for event in sqtt_events])
sqtt_itrace_se_mask = functools.reduce(lambda a,b: a|b, [int(event.itrace) << event.se for event in sqtt_events], 0) if sqtt_itrace_masked else 0
load_events = [x for x in profile if isinstance(x, ProfileProgramEvent) and x.device == device_event.device]
loads = [(event.base, struct.unpack('<Q', hashlib.md5(event.lib).digest()[:8])*2) for event in load_events if event.base is not None and event.lib is not None]
code_objects = list(dict.fromkeys([x.lib for x in load_events if x.lib is not None]).keys())
if len(loads) == 0: raise RuntimeError('No load events in profile')
# TODO: tons of stuff hardcoded for 7900xtx
file_header = sqtt.struct_sqtt_file_header(
magic_number=sqtt.SQTT_FILE_MAGIC_NUMBER,
version_major=sqtt.SQTT_FILE_VERSION_MAJOR,
version_minor=sqtt.SQTT_FILE_VERSION_MINOR,
flags=sqtt.struct_sqtt_file_header_flags(
_0=sqtt.union_sqtt_file_header_flags_0(value=1),
),
chunk_offset=ctypes.sizeof(sqtt.struct_sqtt_file_header),
)
chunks = [
RGPChunk(sqtt.struct_sqtt_file_chunk_cpu_info(
header=sqtt.struct_sqtt_file_chunk_header(
chunk_id=sqtt.struct_sqtt_file_chunk_id(type=sqtt.SQTT_FILE_CHUNK_TYPE_CPU_INFO),
major_version=0, minor_version=0,
),
cpu_timestamp_freq=1000000000,
clock_speed=2994, # in mhz???
num_logical_cores=64,
num_physical_cores=32,
system_ram_size=256*1024, # in mb???
)),
RGPChunk(sqtt.struct_sqtt_file_chunk_asic_info(
header=sqtt.struct_sqtt_file_chunk_header(
chunk_id=sqtt.struct_sqtt_file_chunk_id(type=sqtt.SQTT_FILE_CHUNK_TYPE_ASIC_INFO),
major_version=0, minor_version=5,
),
flags=0,
trace_shader_core_clock=0x93f05080,
trace_memory_clock=0x4a723a40,
device_id=0x744c,
device_revision_id=0xc8,
vgprs_per_simd=1536,
sgprs_per_simd=128*16,
shader_engines=6,
compute_unit_per_shader_engine=16,
simd_per_compute_unit=2,
wavefronts_per_simd=16,
minimum_vgpr_alloc=4,
vgpr_alloc_granularity=8,
minimum_sgpr_alloc=128,
sgpr_alloc_granularity=128,
hardware_contexts=8,
gpu_type=sqtt.SQTT_GPU_TYPE_DISCRETE,
gfxip_level=sqtt.SQTT_GFXIP_LEVEL_GFXIP_11_0,
gpu_index=0,
gds_size=0,
gds_per_shader_engine=0,
ce_ram_size=0,
ce_ram_size_graphics=0,
ce_ram_size_compute=0,
max_number_of_dedicated_cus=0,
vram_size=24 * 1024 * 1024 * 1024, # 24 GB
vram_bus_width=384, # 384-bit
l2_cache_size=6 * 1024 * 1024, # 6 MB
l1_cache_size=32 * 1024, # 32 KB per SIMD (?)
lds_size=65536, # 64 KB per CU
gpu_name=b'NAVI31',
alu_per_clock=0,
texture_per_clock=0,
prims_per_clock=6,
pixels_per_clock=0,
gpu_timestamp_frequency=100000000, # 100 MHz
max_shader_core_clock=2500000000, # 2.5 GHz (boost clock)
max_memory_clock=1250000000, # 1.25 GHz
memory_ops_per_clock=16,
memory_chip_type=sqtt.SQTT_MEMORY_TYPE_GDDR6,
lds_granularity=512,
cu_mask=((255, 255),)*6 + ((0,0),)*(32-6),
gl1_cache_size=256 * 1024, # 256 KB
instruction_cache_size=32 * 1024, # 32 KB
scalar_cache_size=16 * 1024, # 16 KB
mall_cache_size=96 * 1024 * 1024, # 96 MB
)),
RGPChunk(sqtt.struct_sqtt_file_chunk_api_info(
header=sqtt.struct_sqtt_file_chunk_header(
chunk_id=sqtt.struct_sqtt_file_chunk_id(type=sqtt.SQTT_FILE_CHUNK_TYPE_API_INFO),
major_version=0,
minor_version=2,
),
api_type=5, # HIP, not in enum
major_version=12, minor_version=0,
profiling_mode=sqtt.SQTT_PROFILING_MODE_PRESENT,
instruction_trace_mode=sqtt.SQTT_INSTRUCTION_TRACE_FULL_FRAME if sqtt_itrace_enabled else sqtt.SQTT_INSTRUCTION_TRACE_DISABLED,
instruction_trace_data=sqtt.union_sqtt_instruction_trace_data(
shader_engine_filter=sqtt.struct_sqtt_instruction_trace_data_shader_engine_filter(mask=sqtt_itrace_se_mask),
),
)),
*flatten([(
RGPChunk(sqtt.struct_sqtt_file_chunk_sqtt_desc(
header=sqtt.struct_sqtt_file_chunk_header(
chunk_id=sqtt.struct_sqtt_file_chunk_id(type=sqtt.SQTT_FILE_CHUNK_TYPE_SQTT_DESC, index=sqtt_event.se),
major_version=0, minor_version=2,
),
shader_engine_index=sqtt_event.se,
sqtt_version=sqtt.SQTT_VERSION_3_2,
_0=sqtt.union_sqtt_file_chunk_sqtt_desc_0(
v1=sqtt.struct_sqtt_file_chunk_sqtt_desc_0_v1(
instrumentation_spec_version=1,
instrumentation_api_version=0,
compute_unit_index=0,
)
),
)),
RGPChunk(sqtt.struct_sqtt_file_chunk_sqtt_data(
header=sqtt.struct_sqtt_file_chunk_header(
chunk_id=sqtt.struct_sqtt_file_chunk_id(type=sqtt.SQTT_FILE_CHUNK_TYPE_SQTT_DATA, index=sqtt_event.se),
major_version=0, minor_version=0,
),
), sqtt_event.blob),
) for sqtt_event in sqtt_events]),
RGPChunk(sqtt.struct_sqtt_file_chunk_code_object_database(
header=sqtt.struct_sqtt_file_chunk_header(
chunk_id=sqtt.struct_sqtt_file_chunk_id(type=sqtt.SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_DATABASE),
major_version=0, minor_version=0,
),
), [(sqtt.struct_sqtt_code_object_database_record(), lib) for lib in code_objects]),
RGPChunk(sqtt.struct_sqtt_file_chunk_code_object_loader_events(
header=sqtt.struct_sqtt_file_chunk_header(
chunk_id=sqtt.struct_sqtt_file_chunk_id(type=sqtt.SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_LOADER_EVENTS),
major_version=1, minor_version=0,
),
), [sqtt.struct_sqtt_code_object_loader_events_record(base_address=base, code_object_hash=hash) for base,hash in loads]),
RGPChunk(sqtt.struct_sqtt_file_chunk_pso_correlation(
header=sqtt.struct_sqtt_file_chunk_header(
chunk_id=sqtt.struct_sqtt_file_chunk_id(type=sqtt.SQTT_FILE_CHUNK_TYPE_PSO_CORRELATION),
major_version=0, minor_version=0,
),
), [sqtt.struct_sqtt_pso_correlation_record(api_pso_hash=hash[0], pipeline_hash=hash) for _,hash in loads])
]
return RGP(file_header, chunks)
def to_bytes(self) -> bytes:
ret = bytearray()
ret += self.header
for chunk in self.chunks:
ret += chunk.to_bytes(len(ret))
return bytes(ret)
def print(self):
print(pretty(self.header))
for chunk in self.chunks: chunk.print()
if __name__ == '__main__':
  # Command line entry point: rgptool <print|create|repl> <input> [-d DEVICE] [-o OUTPUT]
  parser = argparse.ArgumentParser(prog='rgptool', description='A tool to create (from pickled tinygrad profile), inspect and modify Radeon GPU Profiler files')
  parser.add_argument('command')
  parser.add_argument('input')
  parser.add_argument('-d', '--device')
  parser.add_argument('-o', '--output')
  args = parser.parse_args()

  with open(args.input, 'rb') as fd: input_bytes = fd.read()

  if args.command == 'print':
    # dump an existing RGP file
    rgp = RGP.from_bytes(input_bytes)
    rgp.print()
  elif args.command == 'create':
    # build an RGP file from a pickled profile
    rgp = RGP.from_profile(input_bytes, device=args.device)
  elif args.command == 'repl':
    # load an existing RGP file and open an interactive console with it bound to `rgp`
    rgp = RGP.from_bytes(input_bytes)
    code.interact(local=locals())
  else:
    raise RuntimeError(args.command)

  # every command leaves an RGP object in `rgp`; write it out when asked to
  if args.output is not None:
    with open(args.output, 'wb+') as fd: fd.write(rgp.to_bytes())

840
extra/sqtt/sqtt.h Normal file
View File

@@ -0,0 +1,840 @@
#include <stdint.h>
// Original definition in pal is in c++ and clang2py can't autogen it correctly
// Most of this is copy pasted from mesa/src/amd/common/ac_rgp.{h, c}
/*
* Copyright 2020 Advanced Micro Devices, Inc.
* Copyright 2020 Valve Corporation
*
* SPDX-License-Identifier: MIT
*/
/* File signature and format version stored in struct sqtt_file_header. */
#define SQTT_FILE_MAGIC_NUMBER 0x50303042
#define SQTT_FILE_VERSION_MAJOR 1
#define SQTT_FILE_VERSION_MINOR 5
/* Array dimensions used by struct sqtt_file_chunk_asic_info (gpu_name, cu_mask, active_pixel_packer_mask). */
#define SQTT_GPU_NAME_MAX_SIZE 256
#define SQTT_MAX_NUM_SE 32
#define SQTT_SA_PER_SE 2
#define SQTT_ACTIVE_PIXEL_PACKER_MASK_DWORDS 4
/* Three 32-bit status words describing one trace buffer. */
/* NOTE(review): field meanings are only known from their names here - confirm against the mesa original. */
struct sqtt_data_info {
uint32_t cur_offset;
uint32_t trace_status;
/* both names alias the same 32-bit word */
union {
uint32_t gfx9_write_counter;
uint32_t gfx10_dropped_cntr;
};
};
/* A sqtt_data_info together with a data pointer and the shader engine / compute unit numbers it is tagged with. */
struct sqtt_data_se {
struct sqtt_data_info info;
void *data_ptr;
uint32_t shader_engine;
uint32_t compute_unit;
};
/* SQTT data format versions; the sqtt_version field of struct sqtt_file_chunk_sqtt_desc has this type. */
enum sqtt_version
{
SQTT_VERSION_NONE = 0x0,
SQTT_VERSION_2_2 = 0x5, /* GFX8 */
SQTT_VERSION_2_3 = 0x6, /* GFX9 */
SQTT_VERSION_2_4 = 0x7, /* GFX10+ */
SQTT_VERSION_3_2 = 0xb, /* GFX11+ */
};
/*
* Chunk type tags stored in sqtt_file_chunk_id.type.
* Values are implicit and count up from 0, so the order of the enumerators defines the
* numeric tag of each chunk type. SQTT_FILE_CHUNK_TYPE_COUNT is the number of types.
*/
enum sqtt_file_chunk_type
{
SQTT_FILE_CHUNK_TYPE_ASIC_INFO,
SQTT_FILE_CHUNK_TYPE_SQTT_DESC,
SQTT_FILE_CHUNK_TYPE_SQTT_DATA,
SQTT_FILE_CHUNK_TYPE_API_INFO,
SQTT_FILE_CHUNK_TYPE_RESERVED,
SQTT_FILE_CHUNK_TYPE_QUEUE_EVENT_TIMINGS,
SQTT_FILE_CHUNK_TYPE_CLOCK_CALIBRATION,
SQTT_FILE_CHUNK_TYPE_CPU_INFO,
SQTT_FILE_CHUNK_TYPE_SPM_DB,
SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_DATABASE,
SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_LOADER_EVENTS,
SQTT_FILE_CHUNK_TYPE_PSO_CORRELATION,
SQTT_FILE_CHUNK_TYPE_INSTRUMENTATION_TABLE,
SQTT_FILE_CHUNK_TYPE_COUNT
};
/*
* 32-bit chunk identifier: 8-bit type (enum sqtt_file_chunk_type), 8-bit index, 16 reserved bits.
* rgptool.py sets index to the shader engine number on SQTT_DESC / SQTT_DATA chunks.
*/
struct sqtt_file_chunk_id {
int32_t type : 8;
int32_t index : 8;
int32_t reserved : 16;
};
/*
* Common header that starts every chunk struct.
* size_in_bytes is the size of the whole chunk (chunk struct plus any payload); rgptool.py
* adds it to the current position to find the next chunk.
*/
struct sqtt_file_chunk_header {
struct sqtt_file_chunk_id chunk_id;
uint16_t minor_version;
uint16_t major_version;
int32_t size_in_bytes;
int32_t padding;
};
/* File-level flags, readable either as individual bits or as the raw 32-bit `value`. */
struct sqtt_file_header_flags {
union {
struct {
uint32_t is_semaphore_queue_timing_etw : 1;
uint32_t no_queue_semaphore_timestamps : 1;
uint32_t reserved : 30;
};
uint32_t value;
};
};
/*
* Header at the very start of an RGP file.
* magic_number and version_major are compared against SQTT_FILE_MAGIC_NUMBER and
* SQTT_FILE_VERSION_MAJOR by rgptool.py; chunk_offset is the file position of the first chunk.
*/
struct sqtt_file_header {
uint32_t magic_number;
uint32_t version_major;
uint32_t version_minor;
struct sqtt_file_header_flags flags;
int32_t chunk_offset;
/* NOTE(review): the remaining fields look like a broken-down capture time - confirm units and epoch */
int32_t second;
int32_t minute;
int32_t hour;
int32_t day_in_month;
int32_t month;
int32_t year;
int32_t day_in_week;
int32_t day_in_year;
int32_t is_daylight_savings;
};
/*
* CPU_INFO chunk: header-only chunk describing the host CPU.
* NOTE(review): units of clock_speed and system_ram_size are not established here
* (rgptool.py itself marks them "in mhz???" / "in mb???").
*/
struct sqtt_file_chunk_cpu_info {
struct sqtt_file_chunk_header header;
uint32_t vendor_id[4];
uint32_t processor_brand[12];
uint32_t reserved[2];
uint64_t cpu_timestamp_freq;
uint32_t clock_speed;
uint32_t num_logical_cores;
uint32_t num_physical_cores;
uint32_t system_ram_size;
};
/* Single-bit flag values. NOTE(review): presumably OR-ed into sqtt_file_chunk_asic_info.flags - confirm. */
enum sqtt_file_chunk_asic_info_flags
{
SQTT_FILE_CHUNK_ASIC_INFO_FLAG_SC_PACKER_NUMBERING = (1 << 0),
SQTT_FILE_CHUNK_ASIC_INFO_FLAG_PS1_EVENT_TOKENS_ENABLED = (1 << 1)
};
/* Kind of GPU; type of the gpu_type field of struct sqtt_file_chunk_asic_info. */
enum sqtt_gpu_type
{
SQTT_GPU_TYPE_UNKNOWN = 0x0,
SQTT_GPU_TYPE_INTEGRATED = 0x1,
SQTT_GPU_TYPE_DISCRETE = 0x2,
SQTT_GPU_TYPE_VIRTUAL = 0x3
};
/* Graphics IP generation; type of the gfxip_level field of struct sqtt_file_chunk_asic_info. Values are not contiguous. */
enum sqtt_gfxip_level
{
SQTT_GFXIP_LEVEL_NONE = 0x0,
SQTT_GFXIP_LEVEL_GFXIP_6 = 0x1,
SQTT_GFXIP_LEVEL_GFXIP_7 = 0x2,
SQTT_GFXIP_LEVEL_GFXIP_8 = 0x3,
SQTT_GFXIP_LEVEL_GFXIP_8_1 = 0x4,
SQTT_GFXIP_LEVEL_GFXIP_9 = 0x5,
SQTT_GFXIP_LEVEL_GFXIP_10_1 = 0x7,
SQTT_GFXIP_LEVEL_GFXIP_10_3 = 0x9,
SQTT_GFXIP_LEVEL_GFXIP_11_0 = 0xc,
};
/*
* Memory technology; type of the memory_chip_type field of struct sqtt_file_chunk_asic_info.
* Values are grouped by family in the high nibble (0x0_ DDR, 0x1_ GDDR, 0x2_ HBM, 0x3_ LPDDR).
*/
enum sqtt_memory_type
{
SQTT_MEMORY_TYPE_UNKNOWN = 0x0,
SQTT_MEMORY_TYPE_DDR = 0x1,
SQTT_MEMORY_TYPE_DDR2 = 0x2,
SQTT_MEMORY_TYPE_DDR3 = 0x3,
SQTT_MEMORY_TYPE_DDR4 = 0x4,
SQTT_MEMORY_TYPE_DDR5 = 0x5,
SQTT_MEMORY_TYPE_GDDR3 = 0x10,
SQTT_MEMORY_TYPE_GDDR4 = 0x11,
SQTT_MEMORY_TYPE_GDDR5 = 0x12,
SQTT_MEMORY_TYPE_GDDR6 = 0x13,
SQTT_MEMORY_TYPE_HBM = 0x20,
SQTT_MEMORY_TYPE_HBM2 = 0x21,
SQTT_MEMORY_TYPE_HBM3 = 0x22,
SQTT_MEMORY_TYPE_LPDDR4 = 0x30,
SQTT_MEMORY_TYPE_LPDDR5 = 0x31,
};
/*
* ASIC_INFO chunk: header-only chunk describing the GPU the trace was captured on.
* NOTE(review): units of the size / clock fields are not stated in this header - confirm
* against the mesa original before relying on them.
*/
struct sqtt_file_chunk_asic_info {
struct sqtt_file_chunk_header header;
uint64_t flags;
uint64_t trace_shader_core_clock;
uint64_t trace_memory_clock;
int32_t device_id;
int32_t device_revision_id;
int32_t vgprs_per_simd;
int32_t sgprs_per_simd;
int32_t shader_engines;
int32_t compute_unit_per_shader_engine;
int32_t simd_per_compute_unit;
int32_t wavefronts_per_simd;
int32_t minimum_vgpr_alloc;
int32_t vgpr_alloc_granularity;
int32_t minimum_sgpr_alloc;
int32_t sgpr_alloc_granularity;
int32_t hardware_contexts;
enum sqtt_gpu_type gpu_type;
enum sqtt_gfxip_level gfxip_level;
int32_t gpu_index;
int32_t gds_size;
int32_t gds_per_shader_engine;
int32_t ce_ram_size;
int32_t ce_ram_size_graphics;
int32_t ce_ram_size_compute;
int32_t max_number_of_dedicated_cus;
int64_t vram_size;
int32_t vram_bus_width;
int32_t l2_cache_size;
int32_t l1_cache_size;
int32_t lds_size;
char gpu_name[SQTT_GPU_NAME_MAX_SIZE];
float alu_per_clock;
float texture_per_clock;
float prims_per_clock;
float pixels_per_clock;
uint64_t gpu_timestamp_frequency;
uint64_t max_shader_core_clock;
uint64_t max_memory_clock;
uint32_t memory_ops_per_clock;
enum sqtt_memory_type memory_chip_type;
uint32_t lds_granularity;
/* one 16-bit mask per [SQTT_MAX_NUM_SE][SQTT_SA_PER_SE] slot; presumably one bit per compute unit - confirm */
uint16_t cu_mask[SQTT_MAX_NUM_SE][SQTT_SA_PER_SE];
char reserved1[128];
uint32_t active_pixel_packer_mask[SQTT_ACTIVE_PIXEL_PACKER_MASK_DWORDS];
char reserved2[16];
uint32_t gl1_cache_size;
uint32_t instruction_cache_size;
uint32_t scalar_cache_size;
uint32_t mall_cache_size;
char padding[4];
};
/*
* API that produced the trace; type of sqtt_file_chunk_api_info.api_type. Values are implicit, 0..3.
* rgptool.py writes the value 5 for HIP, which has no enumerator here.
*/
enum sqtt_api_type
{
SQTT_API_TYPE_DIRECTX_12,
SQTT_API_TYPE_VULKAN,
SQTT_API_TYPE_GENERIC,
SQTT_API_TYPE_OPENCL
};
/* Instruction tracing mode; type of sqtt_file_chunk_api_info.instruction_trace_mode. */
enum sqtt_instruction_trace_mode
{
SQTT_INSTRUCTION_TRACE_DISABLED = 0x0,
SQTT_INSTRUCTION_TRACE_FULL_FRAME = 0x1,
SQTT_INSTRUCTION_TRACE_API_PSO = 0x2,
};
/*
* Profiling mode; type of sqtt_file_chunk_api_info.profiling_mode.
* NOTE(review): presumably selects which member of union sqtt_profiling_mode_data is meaningful - confirm.
*/
enum sqtt_profiling_mode
{
SQTT_PROFILING_MODE_PRESENT = 0x0,
SQTT_PROFILING_MODE_USER_MARKERS = 0x1,
SQTT_PROFILING_MODE_INDEX = 0x2,
SQTT_PROFILING_MODE_TAG = 0x3,
};
/*
* Per-mode profiling parameters; the three layouts share storage, so the union is as large
* as its biggest member (two 256-byte strings).
*/
union sqtt_profiling_mode_data {
struct {
char start[256];
char end[256];
} user_marker_profiling_data;
struct {
uint32_t start;
uint32_t end;
} index_profiling_data;
struct {
uint32_t begin_hi;
uint32_t begin_lo;
uint32_t end_hi;
uint32_t end_lo;
} tag_profiling_data;
};
/*
* Instruction trace filter: either a 64-bit PSO filter or a 32-bit shader engine mask (shared storage).
* rgptool.py fills shader_engine_filter.mask with one bit per shader engine that had instruction tracing enabled.
*/
union sqtt_instruction_trace_data {
struct {
uint64_t api_pso_filter;
} api_pso_data;
struct {
uint32_t mask;
} shader_engine_filter;
};
/* API_INFO chunk: header-only chunk recording the producing API, its version and the profiling / instruction trace settings. */
struct sqtt_file_chunk_api_info {
struct sqtt_file_chunk_header header;
enum sqtt_api_type api_type;
/* API version; distinct from the chunk format version kept in header */
uint16_t major_version;
uint16_t minor_version;
enum sqtt_profiling_mode profiling_mode;
uint32_t reserved;
union sqtt_profiling_mode_data profiling_mode_data;
enum sqtt_instruction_trace_mode instruction_trace_mode;
uint32_t reserved2;
union sqtt_instruction_trace_data instruction_trace_data;
};
/*
* Header preceding each code object blob in a CODE_OBJECT_DATABASE chunk.
* size is the number of blob bytes that follow; rgptool.py sets it to the blob length rounded up to a multiple of 4.
*/
struct sqtt_code_object_database_record {
uint32_t size;
};
/*
* CODE_OBJECT_DATABASE chunk header, followed by record_count (record header, blob) pairs.
* As written and read by rgptool.py: offset is the file position of this chunk and size
* covers this header plus all records.
*/
struct sqtt_file_chunk_code_object_database {
struct sqtt_file_chunk_header header;
uint32_t offset;
uint32_t flags;
uint32_t size;
uint32_t record_count;
};
/*
* One code object load event: where a code object identified by a 128-bit hash was loaded.
* rgptool.py fills only base_address and code_object_hash; the other fields stay zero.
*/
struct sqtt_code_object_loader_events_record {
uint32_t loader_event_type;
uint32_t reserved;
uint64_t base_address;
uint64_t code_object_hash[2];
uint64_t time_stamp;
};
/*
* CODE_OBJECT_LOADER_EVENTS chunk header, followed by record_count records of record_size bytes each.
* rgptool.py writes the file position of this chunk into offset.
*/
struct sqtt_file_chunk_code_object_loader_events {
struct sqtt_file_chunk_header header;
uint32_t offset;
uint32_t flags;
uint32_t record_size;
uint32_t record_count;
};
/*
* Links an API-level pipeline hash to a 128-bit internal pipeline hash, with an optional name.
* rgptool.py sets api_pso_hash to the first word of pipeline_hash and leaves the name empty.
*/
struct sqtt_pso_correlation_record {
uint64_t api_pso_hash;
uint64_t pipeline_hash[2];
char api_level_obj_name[64];
};
/*
* PSO_CORRELATION chunk header, followed by record_count records of record_size bytes each.
* rgptool.py writes the file position of this chunk into offset.
*/
struct sqtt_file_chunk_pso_correlation {
struct sqtt_file_chunk_header header;
uint32_t offset;
uint32_t flags;
uint32_t record_size;
uint32_t record_count;
};
/*
* SQTT_DESC chunk: header-only chunk describing the trace of one shader engine.
* The anonymous union has two layouts (v0 / v1) of the same 8 bytes or less; rgptool.py fills v1.
*/
struct sqtt_file_chunk_sqtt_desc {
struct sqtt_file_chunk_header header;
int32_t shader_engine_index;
enum sqtt_version sqtt_version;
union {
struct {
int32_t instrumentation_version;
} v0;
struct {
int16_t instrumentation_spec_version;
int16_t instrumentation_api_version;
int32_t compute_unit_index;
} v1;
};
};
/*
* SQTT_DATA chunk header, followed by the raw trace bytes.
* As used by rgptool.py, offset is the file position of the trace bytes themselves
* (chunk position plus the size of this struct) and size is their length.
*/
struct sqtt_file_chunk_sqtt_data {
struct sqtt_file_chunk_header header;
int32_t offset; /* in bytes */
int32_t size; /* in bytes */
};
/*
* QUEUE_EVENT_TIMINGS chunk header: record counts and sizes of a queue info table and a queue event table.
* rgptool.py skips chunks of this type when reading.
*/
struct sqtt_file_chunk_queue_event_timings {
struct sqtt_file_chunk_header header;
uint32_t queue_info_table_record_count;
uint32_t queue_info_table_size;
uint32_t queue_event_table_record_count;
uint32_t queue_event_table_size;
};
/* Queue kinds. NOTE(review): presumably the values of sqtt_queue_hardware_info.queue_type - confirm. */
enum sqtt_queue_type {
SQTT_QUEUE_TYPE_UNKNOWN = 0x0,
SQTT_QUEUE_TYPE_UNIVERSAL = 0x1,
SQTT_QUEUE_TYPE_COMPUTE = 0x2,
SQTT_QUEUE_TYPE_DMA = 0x3,
};
/* Engine kinds; values 0x5 and 0x6 are unassigned. NOTE(review): presumably the values of sqtt_queue_hardware_info.engine_type - confirm. */
enum sqtt_engine_type {
SQTT_ENGINE_TYPE_UNKNOWN = 0x0,
SQTT_ENGINE_TYPE_UNIVERSAL = 0x1,
SQTT_ENGINE_TYPE_COMPUTE = 0x2,
SQTT_ENGINE_TYPE_EXCLUSIVE_COMPUTE = 0x3,
SQTT_ENGINE_TYPE_DMA = 0x4,
SQTT_ENGINE_TYPE_HIGH_PRIORITY_UNIVERSAL = 0x7,
SQTT_ENGINE_TYPE_HIGH_PRIORITY_GRAPHICS = 0x8,
};
/* 32-bit queue description: 8-bit queue type, 8-bit engine type, 16 reserved bits; `value` aliases the whole word. */
struct sqtt_queue_hardware_info {
union {
struct {
int32_t queue_type : 8;
int32_t engine_type : 8;
uint32_t reserved : 16;
};
uint32_t value;
};
};
/* One queue description: two 64-bit identifiers plus the packed hardware info word. */
/* NOTE(review): presumably an entry of the queue info table of a QUEUE_EVENT_TIMINGS chunk - confirm. */
struct sqtt_queue_info_record {
uint64_t queue_id;
uint64_t queue_context;
struct sqtt_queue_hardware_info hardware_info;
uint32_t reserved;
};
/* Kind of queue event; type of sqtt_queue_event_record.event_type. Values are implicit, 0..3. */
enum sqtt_queue_event_type {
SQTT_QUEUE_TIMING_EVENT_CMDBUF_SUBMIT,
SQTT_QUEUE_TIMING_EVENT_SIGNAL_SEMAPHORE,
SQTT_QUEUE_TIMING_EVENT_WAIT_SEMAPHORE,
SQTT_QUEUE_TIMING_EVENT_PRESENT
};
/* One queue event: its kind, identifying indices and one CPU plus two GPU timestamps. */
/* NOTE(review): queue_info_index presumably indexes the queue info table - confirm. */
struct sqtt_queue_event_record {
enum sqtt_queue_event_type event_type;
uint32_t sqtt_cb_id;
uint64_t frame_index;
uint32_t queue_info_index;
uint32_t submit_sub_index;
uint64_t api_id;
uint64_t cpu_timestamp;
uint64_t gpu_timestamps[2];
};
/*
* CLOCK_CALIBRATION chunk: one CPU timestamp and one GPU timestamp.
* rgptool.py skips chunks of this type when reading.
*/
struct sqtt_file_chunk_clock_calibration {
struct sqtt_file_chunk_header header;
uint64_t cpu_timestamp;
uint64_t gpu_timestamp;
uint64_t reserved;
};
/* NOTE(review): by their names these are ELF e_flags machine values (EF_AMDGPU_MACH_*) for AMD GPU targets - confirm. */
enum elf_gfxip_level
{
EF_AMDGPU_MACH_AMDGCN_GFX801 = 0x028,
EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
EF_AMDGPU_MACH_AMDGCN_GFX1010 = 0x033,
EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036,
EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041,
};
/*
* SPM_DB chunk header: sizes and counts describing the data that follows it.
* rgptool.py skips chunks of this type when reading.
*/
struct sqtt_file_chunk_spm_db {
struct sqtt_file_chunk_header header;
uint32_t flags;
uint32_t preamble_size;
uint32_t num_timestamps;
uint32_t num_spm_counter_info;
uint32_t spm_counter_info_size;
uint32_t sample_interval;
};
/**
* Identifiers for RGP SQ thread-tracing markers (Table 1)
*
* All sixteen values 0x0..0xF are assigned, which matches the 4-bit `identifier`
* bitfield at the start of the marker structs.
*/
enum rgp_sqtt_marker_identifier
{
RGP_SQTT_MARKER_IDENTIFIER_EVENT = 0x0,
RGP_SQTT_MARKER_IDENTIFIER_CB_START = 0x1,
RGP_SQTT_MARKER_IDENTIFIER_CB_END = 0x2,
RGP_SQTT_MARKER_IDENTIFIER_BARRIER_START = 0x3,
RGP_SQTT_MARKER_IDENTIFIER_BARRIER_END = 0x4,
RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT = 0x5,
RGP_SQTT_MARKER_IDENTIFIER_GENERAL_API = 0x6,
RGP_SQTT_MARKER_IDENTIFIER_SYNC = 0x7,
RGP_SQTT_MARKER_IDENTIFIER_PRESENT = 0x8,
RGP_SQTT_MARKER_IDENTIFIER_LAYOUT_TRANSITION = 0x9,
RGP_SQTT_MARKER_IDENTIFIER_RENDER_PASS = 0xA,
RGP_SQTT_MARKER_IDENTIFIER_RESERVED2 = 0xB,
RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE = 0xC,
RGP_SQTT_MARKER_IDENTIFIER_RESERVED4 = 0xD,
RGP_SQTT_MARKER_IDENTIFIER_RESERVED5 = 0xE,
RGP_SQTT_MARKER_IDENTIFIER_RESERVED6 = 0xF
};
/**
* Command buffer IDs used in RGP SQ thread-tracing markers (only 20 bits).
*
* Two alternative layouts share the same word; the `per_frame` bit says which one applies.
* Each layout uses 20 bits followed by 12 reserved bits; `all` aliases the whole 32-bit word.
*/
union rgp_sqtt_marker_cb_id {
struct {
uint32_t per_frame : 1; /* Must be 1, frame-based command buffer ID. */
uint32_t frame_index : 7;
uint32_t cb_index : 12; /* Command buffer index within the frame. */
uint32_t reserved : 12;
} per_frame_cb_id;
struct {
uint32_t per_frame : 1; /* Must be 0, global command buffer ID. */
uint32_t cb_index : 19; /* Global command buffer index. */
uint32_t reserved : 12;
} global_cb_id;
uint32_t all;
};
/**
* RGP SQ thread-tracing marker for the start of a command buffer. (Table 2)
*
* Four dwords. Each dword is an anonymous union that exposes the named field(s)
* next to a raw `dwordNN` alias of the same 32 bits.
*/
struct rgp_sqtt_marker_cb_start {
union {
struct {
uint32_t identifier : 4;
uint32_t ext_dwords : 3;
uint32_t cb_id : 20;
uint32_t queue : 5;
};
uint32_t dword01;
};
union {
uint32_t device_id_low;
uint32_t dword02;
};
union {
uint32_t device_id_high;
uint32_t dword03;
};
union {
uint32_t queue_flags;
uint32_t dword04;
};
};
/**
*
* RGP SQ thread-tracing marker for the end of a command buffer. (Table 3)
*
* Three dwords. Each dword is an anonymous union that exposes the named field(s)
* next to a raw `dwordNN` alias of the same 32 bits.
*/
struct rgp_sqtt_marker_cb_end {
union {
struct {
uint32_t identifier : 4;
uint32_t ext_dwords : 3;
uint32_t cb_id : 20;
uint32_t reserved : 5;
};
uint32_t dword01;
};
union {
uint32_t device_id_low;
uint32_t dword02;
};
union {
uint32_t device_id_high;
uint32_t dword03;
};
};
/**
* API types used in RGP SQ thread-tracing markers for the "General API"
* packet.
*
* Value 46 is unassigned (see the gap below).
* NOTE(review): ApiRayTracingSeparateCompiled (0x800000) and ApiInvalid (0xffffffff) need more
* than the 20 bits of rgp_sqtt_marker_general_api.api_type - confirm how they are used.
*/
enum rgp_sqtt_marker_general_api_type
{
ApiCmdBindPipeline = 0,
ApiCmdBindDescriptorSets = 1,
ApiCmdBindIndexBuffer = 2,
ApiCmdBindVertexBuffers = 3,
ApiCmdDraw = 4,
ApiCmdDrawIndexed = 5,
ApiCmdDrawIndirect = 6,
ApiCmdDrawIndexedIndirect = 7,
ApiCmdDrawIndirectCountAMD = 8,
ApiCmdDrawIndexedIndirectCountAMD = 9,
ApiCmdDispatch = 10,
ApiCmdDispatchIndirect = 11,
ApiCmdCopyBuffer = 12,
ApiCmdCopyImage = 13,
ApiCmdBlitImage = 14,
ApiCmdCopyBufferToImage = 15,
ApiCmdCopyImageToBuffer = 16,
ApiCmdUpdateBuffer = 17,
ApiCmdFillBuffer = 18,
ApiCmdClearColorImage = 19,
ApiCmdClearDepthStencilImage = 20,
ApiCmdClearAttachments = 21,
ApiCmdResolveImage = 22,
ApiCmdWaitEvents = 23,
ApiCmdPipelineBarrier = 24,
ApiCmdBeginQuery = 25,
ApiCmdEndQuery = 26,
ApiCmdResetQueryPool = 27,
ApiCmdWriteTimestamp = 28,
ApiCmdCopyQueryPoolResults = 29,
ApiCmdPushConstants = 30,
ApiCmdBeginRenderPass = 31,
ApiCmdNextSubpass = 32,
ApiCmdEndRenderPass = 33,
ApiCmdExecuteCommands = 34,
ApiCmdSetViewport = 35,
ApiCmdSetScissor = 36,
ApiCmdSetLineWidth = 37,
ApiCmdSetDepthBias = 38,
ApiCmdSetBlendConstants = 39,
ApiCmdSetDepthBounds = 40,
ApiCmdSetStencilCompareMask = 41,
ApiCmdSetStencilWriteMask = 42,
ApiCmdSetStencilReference = 43,
ApiCmdDrawIndirectCount = 44,
ApiCmdDrawIndexedIndirectCount = 45,
/* gap */
ApiCmdDrawMeshTasksEXT = 47,
ApiCmdDrawMeshTasksIndirectCountEXT = 48,
ApiCmdDrawMeshTasksIndirectEXT = 49,
ApiRayTracingSeparateCompiled = 0x800000,
ApiInvalid = 0xffffffff
};
/**
* RGP SQ thread-tracing marker for a "General API" instrumentation packet.
*
* A single dword: 4-bit identifier, 3-bit ext_dwords, 20-bit api_type, 1-bit is_end and
* 4 reserved bits; `dword01` aliases the whole 32-bit word.
*/
struct rgp_sqtt_marker_general_api {
union {
struct {
uint32_t identifier : 4;
uint32_t ext_dwords : 3;
uint32_t api_type : 20;
uint32_t is_end : 1;
uint32_t reserved : 4;
};
uint32_t dword01;
};
};
/**
* API types used in RGP SQ thread-tracing markers (Table 16).
*
* Values are assigned explicitly with two holes (29 and 37-40), followed by
* the sentinels EventUnknown and EventInvalid.
*
* NOTE(review): EventInvalid (0xffffffff) is wider than the 24-bit api_type
* field of struct rgp_sqtt_marker_event -- confirm how it is meant to be
* stored.
*/
enum rgp_sqtt_marker_event_type
{
EventCmdDraw = 0,
EventCmdDrawIndexed = 1,
EventCmdDrawIndirect = 2,
EventCmdDrawIndexedIndirect = 3,
EventCmdDrawIndirectCountAMD = 4,
EventCmdDrawIndexedIndirectCountAMD = 5,
EventCmdDispatch = 6,
EventCmdDispatchIndirect = 7,
EventCmdCopyBuffer = 8,
EventCmdCopyImage = 9,
EventCmdBlitImage = 10,
EventCmdCopyBufferToImage = 11,
EventCmdCopyImageToBuffer = 12,
EventCmdUpdateBuffer = 13,
EventCmdFillBuffer = 14,
EventCmdClearColorImage = 15,
EventCmdClearDepthStencilImage = 16,
EventCmdClearAttachments = 17,
EventCmdResolveImage = 18,
EventCmdWaitEvents = 19,
EventCmdPipelineBarrier = 20,
EventCmdResetQueryPool = 21,
EventCmdCopyQueryPoolResults = 22,
EventRenderPassColorClear = 23,
EventRenderPassDepthStencilClear = 24,
EventRenderPassResolve = 25,
EventInternalUnknown = 26,
EventCmdDrawIndirectCount = 27,
EventCmdDrawIndexedIndirectCount = 28,
/* gap: 29 is not assigned */
EventCmdTraceRaysKHR = 30,
EventCmdTraceRaysIndirectKHR = 31,
EventCmdBuildAccelerationStructuresKHR = 32,
EventCmdBuildAccelerationStructuresIndirectKHR = 33,
EventCmdCopyAccelerationStructureKHR = 34,
EventCmdCopyAccelerationStructureToMemoryKHR = 35,
EventCmdCopyMemoryToAccelerationStructureKHR = 36,
/* gap: 37-40 are not assigned */
EventCmdDrawMeshTasksEXT = 41,
EventCmdDrawMeshTasksIndirectCountEXT = 42,
EventCmdDrawMeshTasksIndirectEXT = 43,
EventUnknown = 0x7fff,
EventInvalid = 0xffffffff
};
/**
* "Event (Per-draw/dispatch)" RGP SQ thread-tracing marker. (Table 4)
*
* Three dwords; each one is a union so it can be accessed either by named
* field or as a raw word (dword01..dword03).
*/
struct rgp_sqtt_marker_event {
/* Dword 1: 4 + 3 + 24 + 1 = 32 bits. */
union {
struct {
uint32_t identifier : 4;
uint32_t ext_dwords : 3;
uint32_t api_type : 24; /* presumably an enum rgp_sqtt_marker_event_type value -- confirm */
uint32_t has_thread_dims : 1; /* see struct rgp_sqtt_marker_event_with_dims for the variant carrying dims */
};
uint32_t dword01;
};
/* Dword 2: 20 + 4 + 4 + 4 = 32 bits. */
union {
struct {
uint32_t cb_id : 20;
uint32_t vertex_offset_reg_idx : 4;
uint32_t instance_offset_reg_idx : 4;
uint32_t draw_index_reg_idx : 4;
};
uint32_t dword02;
};
/* Dword 3: cmd_id and dword03 name the same 32-bit storage. */
union {
uint32_t cmd_id;
uint32_t dword03;
};
};
/**
* Per-dispatch specific marker where workgroup dims are included.
*
* Layout is the three-dword event marker followed by three more dwords, one
* per dimension (x, y, z).
*/
struct rgp_sqtt_marker_event_with_dims {
struct rgp_sqtt_marker_event event;
uint32_t thread_x;
uint32_t thread_y;
uint32_t thread_z;
};
/**
* "Barrier Start" RGP SQTT instrumentation marker (Table 5)
*
* Two dwords, each accessible by named bitfield or as a raw word.
*/
struct rgp_sqtt_marker_barrier_start {
/* Dword 1: 4 + 3 + 20 + 5 = 32 bits. */
union {
struct {
uint32_t identifier : 4;
uint32_t ext_dwords : 3;
uint32_t cb_id : 20;
uint32_t reserved : 5;
};
uint32_t dword01;
};
/* Dword 2: 31-bit driver_reason plus a 1-bit internal flag. */
union {
struct {
uint32_t driver_reason : 31;
uint32_t internal : 1;
};
uint32_t dword02;
};
};
/**
* "Barrier End" RGP SQTT instrumentation marker (Table 6)
*
* Two dwords, each accessible by named bitfield or as a raw word.
*/
struct rgp_sqtt_marker_barrier_end {
/* Dword 1: 4 + 3 + 20 header bits followed by five single-bit flags
* (32 bits total). */
union {
struct {
uint32_t identifier : 4;
uint32_t ext_dwords : 3;
uint32_t cb_id : 20;
uint32_t wait_on_eop_ts : 1;
uint32_t vs_partial_flush : 1;
uint32_t ps_partial_flush : 1;
uint32_t cs_partial_flush : 1;
uint32_t pfp_sync_me : 1;
};
uint32_t dword01;
};
/* Dword 2: ten single-bit flags, a 16-bit num_layout_transitions count,
* five more single-bit flags and one reserved bit (32 bits total). */
union {
struct {
uint32_t sync_cp_dma : 1;
uint32_t inval_tcp : 1;
uint32_t inval_sqI : 1;
uint32_t inval_sqK : 1;
uint32_t flush_tcc : 1;
uint32_t inval_tcc : 1;
uint32_t flush_cb : 1;
uint32_t inval_cb : 1;
uint32_t flush_db : 1;
uint32_t inval_db : 1;
uint32_t num_layout_transitions : 16;
uint32_t inval_gl1 : 1;
uint32_t wait_on_ts : 1;
uint32_t eop_ts_bottom_of_pipe : 1;
uint32_t eos_ts_ps_done : 1;
uint32_t eos_ts_cs_done : 1;
uint32_t reserved : 1;
};
uint32_t dword02;
};
};
/**
* "Layout Transition" RGP SQTT instrumentation marker (Table 7)
*
* Two dwords, each accessible by named bitfield or as a raw word.
*/
struct rgp_sqtt_marker_layout_transition {
/* Dword 1: 4 + 3 header bits, eight single-bit transition flags and
* 17 reserved bits (32 bits total). */
union {
struct {
uint32_t identifier : 4;
uint32_t ext_dwords : 3;
uint32_t depth_stencil_expand : 1;
uint32_t htile_hiz_range_expand : 1;
uint32_t depth_stencil_resummarize : 1;
uint32_t dcc_decompress : 1;
uint32_t fmask_decompress : 1;
uint32_t fast_clear_eliminate : 1;
uint32_t fmask_color_expand : 1;
uint32_t init_mask_ram : 1;
uint32_t reserved1 : 17;
};
uint32_t dword01;
};
/* Dword 2: entirely reserved. */
union {
struct {
uint32_t reserved2 : 32;
};
uint32_t dword02;
};
};
/**
* "User Event" RGP SQTT instrumentation marker (Table 8)
*
* A single dword (4 + 8 + 8 + 12 = 32 bits) overlaying the raw word dword01.
* Unlike the other markers in this header it has no ext_dwords field.
*/
struct rgp_sqtt_marker_user_event {
union {
struct {
uint32_t identifier : 4;
uint32_t reserved0 : 8;
uint32_t data_type : 8; /* presumably an enum rgp_sqtt_marker_user_event_type value -- confirm */
uint32_t reserved1 : 12;
};
uint32_t dword01;
};
};
/* User-event marker followed by one extra dword holding a length.
* NOTE(review): what the length counts (bytes or dwords of a trailing
* payload) is not visible here -- confirm against the RGP spec. */
struct rgp_sqtt_marker_user_event_with_length {
struct rgp_sqtt_marker_user_event user_event;
uint32_t length;
};
/* Kinds of user event. Only the first value is explicit; the rest follow
* sequentially (Pop = 1, Push = 2, ObjectName = 3). */
enum rgp_sqtt_marker_user_event_type
{
UserEventTrigger = 0,
UserEventPop,
UserEventPush,
UserEventObjectName,
};
/**
* "Pipeline bind" RGP SQTT instrumentation marker (Table 12)
*
* Three dwords: a header dword followed by a 64-bit pipeline hash stored as
* two 32-bit words.
*/
struct rgp_sqtt_marker_pipeline_bind {
/* Dword 1: 4 + 3 + 1 + 20 + 4 = 32 bits. */
union {
struct {
uint32_t identifier : 4;
uint32_t ext_dwords : 3;
uint32_t bind_point : 1;
uint32_t cb_id : 20;
uint32_t reserved : 4;
};
uint32_t dword01;
};
/* Dwords 2-3: api_pso_hash[0] / api_pso_hash[1] alias dword02 / dword03. */
union {
uint32_t api_pso_hash[2];
struct {
uint32_t dword02;
uint32_t dword03;
};
};
};

View File

@@ -57,6 +57,9 @@ class ProfileDeviceEvent(ProfileEvent):
# Immutable profiling record for a named range on a device. st/en are Decimal bounds (presumably start/end timestamps -- confirm).
@dataclass(frozen=True)
class ProfileRangeEvent(ProfileEvent): device:str; name:str; st:decimal.Decimal; en:decimal.Decimal; is_copy:bool # noqa: E702
# Immutable profiling record describing a loaded program: the device and program name, plus the optional binary (lib) and
# an optional integer base (presumably the address the binary is loaded at -- confirm). Both may be None.
@dataclass(frozen=True)
class ProfileProgramEvent(ProfileEvent): device:str; name:str; lib:bytes|None; base:int|None # noqa: E702
# Immutable description of one entry of a profiled graph. st_id/en_id are integer ids (presumably indices of the entry's
# start/end timestamps -- confirm). Note this one does not derive from ProfileEvent.
@dataclass(frozen=True)
class ProfileGraphEntry: device:str; name:str; st_id:int; en_id:int; is_copy:bool # noqa: E702
@@ -342,8 +345,9 @@ if PROFILE:
with open(fn:=temp("profile.pkl", append_user=True), "wb") as f: pickle.dump(Compiled.profile_events, f)
from tinygrad.ops import launch_viz
launch_viz("PROFILE", fn)
if not getenv("SQTT", 0):
from tinygrad.ops import launch_viz
launch_viz("PROFILE", fn)
if __name__ == "__main__":
for device in ALL_DEVICES:

File diff suppressed because it is too large Load Diff

View File

@@ -1,15 +1,15 @@
from __future__ import annotations
from typing import Any, cast, ClassVar
import os, ctypes, ctypes.util, functools, mmap, errno, array, contextlib, sys, select
import os, ctypes, ctypes.util, struct, hashlib, functools, mmap, errno, array, contextlib, sys, select
assert sys.platform != 'win32'
from dataclasses import dataclass
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, HWInterface
from tinygrad.ops import sint
from tinygrad.device import BufferSpec, CPUProgram
from tinygrad.device import Compiled, ProfileEvent, BufferSpec, CPUProgram, PROFILE
from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address, DEBUG, OSX
from tinygrad.renderer.cstyle import AMDRenderer
from tinygrad.runtime.autogen import kfd, hsa, amd_gpu, libc, pci, vfio
from tinygrad.runtime.autogen.am import am
from tinygrad.runtime.autogen import kfd, hsa, amd_gpu, libc, pci, vfio, sqtt
from tinygrad.runtime.autogen.am import am, gc_11_0_0
from tinygrad.runtime.support.compiler_hip import AMDCompiler
from tinygrad.runtime.support.elf import elf_loader
from tinygrad.runtime.support.am.amdev import AMDev, AMMapping
@@ -18,13 +18,21 @@ if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401 # pylint
regBIF_BX_PF1_GPU_HDP_FLUSH_REQ, regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0106, 0x0107
EVENT_INDEX_PARTIAL_FLUSH = 4 # based on a comment in nvd.h
WAIT_REG_MEM_FUNCTION_EQ = 3 # ==
WAIT_REG_MEM_FUNCTION_NEQ = 4 # !=
WAIT_REG_MEM_FUNCTION_GEQ = 5 # >=
COMPUTE_SHADER_EN, FORCE_START_AT_000, CS_W32_EN = (1 << 0), (1 << 2), (1 << 15)
def gfxreg(reg):
  """Rebase `reg` onto GC segment 0 and make it relative to PACKET3_SET_SH_REG_START (the form passed to PACKET3_SET_SH_REG)."""
  absolute = reg + amd_gpu.GC_BASE__INST0_SEG0
  return absolute - amd_gpu.PACKET3_SET_SH_REG_START
def ucfgreg(reg, pkt3_set:bool=True):
  """
  Rebase `reg` onto GC segment 1.

  With `pkt3_set` (the default) the result is additionally made relative to PACKET3_SET_UCONFIG_REG_START, which is the form
  passed to PACKET3_SET_UCONFIG_REG; with `pkt3_set=False` the rebased offset is returned as is.
  """
  absolute = reg + amd_gpu.GC_BASE__INST0_SEG1
  if pkt3_set: return absolute - amd_gpu.PACKET3_SET_UCONFIG_REG_START
  return absolute - 0
def nbioreg(reg):
  """Rebase `reg` onto NBIO segment 2."""
  nbio_base = amd_gpu.NBIO_BASE__INST0_SEG2
  return reg + nbio_base
# NOTE: the field shifts are taken from gc_11_0_0, i.e. this is hardcoded to gfx11 and other gfx versions may lay the registers out differently.
# That is fine for now: the only user is sqtt, which is only supported on 7900xtx. Could potentially be shared with AMRegister._parse_kwargs.
def encode_bitfields(regname: str, **kwargs) -> int:
  """
  Pack named register fields into a single register value.

  Each keyword is a field name of register `regname` (case-insensitive); its value is shifted left by
  `gc_11_0_0.<regname>__<FIELD>__SHIFT` and all shifted values are OR-ed together. Values are not masked to the field width.
  With no fields the result is 0.
  """
  shifted = [value << getattr(gc_11_0_0, f'{regname}__{field.upper()}__SHIFT') for field, value in kwargs.items()]
  packed = 0
  for part in shifted: packed |= part
  return packed
class AMDSignal(HCQSignal):
def __init__(self, base_addr:int|None=None, **kwargs):
super().__init__(base_addr, **kwargs, timestamp_divider=100, dev_t=AMDDevice)
@@ -40,6 +48,11 @@ class AMDComputeQueue(HWQueue):
def pkt3(self, cmd, *vals):
  """Append a PACKET3 header for `cmd` (built from the opcode and `len(vals) - 1`) followed by the payload dwords `vals`."""
  header = amd_gpu.PACKET3(cmd, len(vals) - 1)
  self.q(header, *vals)
def sqtt_userdata(self, data, *extra_dwords):
  """
  Emit an SQTT marker through the thread-trace userdata registers.

  `data` is anything convertible with `bytes()` (e.g. a ctypes marker struct); it is split into little-endian 32-bit words,
  `extra_dwords` are appended, and the words are written at most two at a time starting at regSQ_THREAD_TRACE_USERDATA_2.
  """
  dwords = [word for (word,) in struct.iter_unpack('<I', bytes(data))]
  dwords += list(extra_dwords)
  pairs = [dwords[start:start+2] for start in range(0, len(dwords), 2)]
  for pair in pairs:
    self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_USERDATA_2), *pair)
def wait_reg_mem(self, value, mask=0xffffffff, mem=None, reg_req=None, reg_done=None):
wrm_info_dw = amd_gpu.WAIT_REG_MEM_MEM_SPACE(int(mem is not None)) | amd_gpu.WAIT_REG_MEM_OPERATION(int(mem is None)) \
| amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | amd_gpu.WAIT_REG_MEM_ENGINE(0)
@@ -72,6 +85,83 @@ class AMDComputeQueue(HWQueue):
self.acquire_mem()
return self
def spi_config(self, tracing:bool):
  """Write SPI_CONFIG_CNTL with fixed priority settings; the SQG bottom/top-of-pipe event enables follow `tracing`."""
  sqg_events = int(tracing)
  fields = dict(ps_pkr_priority_cntl=3, exp_priority_order=3, gpr_write_priority=0x2c688,
                enable_sqg_bop_events=sqg_events, enable_sqg_top_events=sqg_events)
  self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSPI_CONFIG_CNTL), encode_bitfields('SPI_CONFIG_CNTL', **fields))
def sqtt_config(self, tracing:bool):
  """Write SQ_THREAD_TRACE_CTRL; every field is fixed except `mode`, which is 1 when `tracing` is true and 0 otherwise."""
  fields = dict(draw_event_en=1, spi_stall_en=1, sq_stall_en=1, reg_at_hwm=2, hiwater=1, rt_freq=amd_gpu.SQ_TT_RT_FREQ_4096_CLK,
                util_timer=amd_gpu.SQ_TT_UTIL_TIMER_250_CLK, mode=int(tracing))
  ctrl_value = encode_bitfields('SQ_THREAD_TRACE_CTRL', **fields)
  self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_CTRL), ctrl_value)
def grbm_gfx_index(self, **kwargs):
  """Write GRBM_GFX_INDEX; `kwargs` are the register's field names and values (see encode_bitfields)."""
  index_value = encode_bitfields('GRBM_GFX_INDEX', **kwargs)
  self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regGRBM_GFX_INDEX), index_value)
# Magic values from mesa/src/amd/vulkan/radv_sqtt.c:radv_emit_spi_config_cntl and src/amd/common/ac_sqtt.c:ac_sqtt_emit_start
def start_trace(self, buf0s:list[HCQBuffer], se_mask:int):
  """
  Queue the commands that configure and enable SQ thread tracing.

  `buf0s` holds one trace buffer per shader engine (`buf0s[se]` is programmed while shader engine `se` is selected).
  Bit `se` of `se_mask` decides whether instruction-level tokens are kept for that engine: when the bit is clear the
  VMEMEXEC/ALUEXEC/VALUINST/IMMEDIATE/INST tokens are added to the exclude mask. Returns `self` so calls can be chained.
  """
  # Register classes whose writes are recorded. The last term used to repeat CONTEXT_BIT (a no-op OR); mesa's
  # ac_sqtt_emit_start includes CONFIG registers as the sixth class, so that is what is selected here.
  reg_include = amd_gpu.SQ_TT_TOKEN_MASK_SQDEC_BIT | amd_gpu.SQ_TT_TOKEN_MASK_SHDEC_BIT | amd_gpu.SQ_TT_TOKEN_MASK_GFXUDEC_BIT | \
                amd_gpu.SQ_TT_TOKEN_MASK_COMP_BIT | amd_gpu.SQ_TT_TOKEN_MASK_CONTEXT_BIT | amd_gpu.SQ_TT_TOKEN_MASK_CONFIG_BIT
  self.memory_barrier()
  self.spi_config(tracing=True)
  # One buffer for one SE, mesa does it with a single buffer and ac_sqtt_get_data_offset, but this is simpler and should work just as well
  for se, buf0 in enumerate(buf0s):
    self.grbm_gfx_index(se_index=se, instance_broadcast_writes=1) # select se, broadcast to all instances in that se
    # The buffer address is programmed in 4KiB units: low 32 bits go to BUF0_BASE, the rest to BUF0_SIZE.base_hi
    buf0_lo, buf0_hi = data64_le(buf0.va_addr>>12)
    self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_BUF0_SIZE),
              encode_bitfields('SQ_THREAD_TRACE_BUF0_SIZE', base_hi=buf0_hi, size=buf0.size>>12))
    self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_BUF0_BASE), buf0_lo)
    # NOTE: SQTT can only trace instructions on one simd per se, this selects first simd in first wgp in first sa.
    # For RGP to display instruction trace it has to see it on first SE. However ACE/MEC/whatever does the dispatching starting with second se,
    # and on amdgpu/non-AM it also does weird things with dispatch order inside se: around 7 times out of 10 it starts from the last cu, but
    # sometimes not, especially if the kernel has more than one wavefront which means that kernels with small global size might get unlucky and
    # be dispatched on something else and not be seen in instruction tracing tab. You can force the wavefronts of a kernel to be dispatched on the
    # CUs you want to by disabling other CUs via bits in regCOMPUTE_STATIC_THREAD_MGMT_SE<x> and trace even kernels that only have one wavefront.
    self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_MASK),
              encode_bitfields('SQ_THREAD_TRACE_MASK', wtype_include=amd_gpu.SQ_TT_WTYPE_INCLUDE_CS_BIT, simd_sel=0, wgp_sel=0, sa_sel=0))
    # PERF tokens are always dropped; instruction-level tokens are dropped too unless this SE is selected in se_mask
    token_exclude = 1 << amd_gpu.SQ_TT_TOKEN_EXCLUDE_PERF_SHIFT
    if not (se_mask >> se) & 0b1:
      token_exclude |= 1 << amd_gpu.SQ_TT_TOKEN_EXCLUDE_VMEMEXEC_SHIFT | 1 << amd_gpu.SQ_TT_TOKEN_EXCLUDE_ALUEXEC_SHIFT | \
                       1 << amd_gpu.SQ_TT_TOKEN_EXCLUDE_VALUINST_SHIFT | 1 << amd_gpu.SQ_TT_TOKEN_EXCLUDE_IMMEDIATE_SHIFT | \
                       1 << amd_gpu.SQ_TT_TOKEN_EXCLUDE_INST_SHIFT
    self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_TOKEN_MASK),
              encode_bitfields('SQ_THREAD_TRACE_TOKEN_MASK', reg_include=reg_include, token_exclude=token_exclude, bop_events_token_include=1))
    # Enable SQTT (written last for this SE, after its buffer and masks are programmed)
    self.sqtt_config(tracing=True)
  # Restore global broadcasting
  self.grbm_gfx_index(se_broadcast_writes=1, sa_broadcast_writes=1, instance_broadcast_writes=1)
  self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_THREAD_TRACE_ENABLE), 1)
  self.memory_barrier()
  return self
# Magic values from src/amd/common/ac_sqtt.c:ac_sqtt_emit_stop and src/amd/common/ac_sqtt.c:ac_sqtt_emit_wait
def stop_trace(self, ses: int, wptrs: HCQBuffer):
  """
  Queue the commands that shut SQ thread tracing down on `ses` shader engines.

  For every shader engine the trace is drained and disabled, then its SQ_THREAD_TRACE_WPTR register is copied into
  `wptrs` at byte offset `se*4`, which tells the host where the trace data ends. Returns `self` so calls can be chained.
  """
  status_reg = ucfgreg(amd_gpu.regSQ_THREAD_TRACE_STATUS, False)
  def wait_status(func, mask):
    # Poll SQ_THREAD_TRACE_STATUS until (status & mask) compares against 0 as requested by `func`
    self.pkt3(amd_gpu.PACKET3_WAIT_REG_MEM, amd_gpu.WAIT_REG_MEM_FUNCTION(func), status_reg, 0, 0, mask, 4)

  self.memory_barrier()
  # Begin the shutdown: disable compute thread tracing and ask the hardware to finish the trace
  self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_THREAD_TRACE_ENABLE), 0)
  self.pkt3(amd_gpu.PACKET3_EVENT_WRITE, amd_gpu.EVENT_TYPE(amd_gpu.THREAD_TRACE_FINISH) | amd_gpu.EVENT_INDEX(0))
  # Per SE: wait until the finish completed, then save regSQ_THREAD_TRACE_WPTR so the end of the trace data is known
  se = 0
  while se < ses:
    self.grbm_gfx_index(se_index=se, instance_broadcast_writes=1) # select se, broadcast to all instances in that se
    wait_status(WAIT_REG_MEM_FUNCTION_EQ, gc_11_0_0.SQ_THREAD_TRACE_STATUS__FINISH_PENDING_MASK) # FINISH_PENDING==0
    wait_status(WAIT_REG_MEM_FUNCTION_NEQ, gc_11_0_0.SQ_THREAD_TRACE_STATUS__FINISH_DONE_MASK) # FINISH_DONE!=0
    # Disable SQTT
    self.sqtt_config(tracing=False)
    wait_status(WAIT_REG_MEM_FUNCTION_EQ, gc_11_0_0.SQ_THREAD_TRACE_STATUS__BUSY_MASK) # BUSY==0
    # Copy WPTR to memory (src_sel = perf, dst_sel = tc_l2, wr_confirm = True), ucfgreg with False adds GC_BASE__INST0_SEG1 but not pkt3 reg offset
    wptr_dst = data64_le(wptrs.va_addr+(se*4))
    self.pkt3(amd_gpu.PACKET3_COPY_DATA, 1 << 20 | 2 << 8 | 4, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_WPTR, False), 0, *wptr_dst)
    se += 1
  # Restore global broadcasting
  self.grbm_gfx_index(se_broadcast_writes=1, sa_broadcast_writes=1, instance_broadcast_writes=1)
  self.spi_config(tracing=False)
  self.memory_barrier()
  return self
def exec(self, prg:AMDProgram, args_state:CLikeArgsState, global_size:tuple[sint, ...], local_size:tuple[sint, ...]):
self.bind_args_state(args_state)
@@ -93,6 +183,20 @@ class AMDComputeQueue(HWQueue):
user_regs += [*data64_le(args_state.ptr)]
if prg.dev.sqtt_enabled:
self.sqtt_userdata(sqtt.struct_rgp_sqtt_marker_pipeline_bind(
_0=sqtt.union_rgp_sqtt_marker_pipeline_bind_0(_0=sqtt.struct_rgp_sqtt_marker_pipeline_bind_0_0(
identifier=sqtt.RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE,
bind_point=1, # compute
)),
_1=sqtt.union_rgp_sqtt_marker_pipeline_bind_1(api_pso_hash=data64_le(prg.libhash[0])),
))
self.sqtt_userdata(sqtt.struct_rgp_sqtt_marker_event(
_0=sqtt.union_rgp_sqtt_marker_event_0(_0=sqtt.struct_rgp_sqtt_marker_event_0_0(has_thread_dims=1)),
_2=sqtt.union_rgp_sqtt_marker_event_2(cmd_id=prg.dev.cmd_id),
), *global_size)
prg.dev.cmd_id += 1
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8))
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC1), prg.rsrc1, prg.rsrc2)
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC3), 0)
@@ -110,6 +214,7 @@ class AMDComputeQueue(HWQueue):
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_RESOURCE_LIMITS), 0)
self.pkt3(amd_gpu.PACKET3_DISPATCH_DIRECT, *global_size, CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN)
if prg.dev.sqtt_enabled: self.pkt3(amd_gpu.PACKET3_EVENT_WRITE, amd_gpu.EVENT_TYPE(amd_gpu.THREAD_TRACE_MARKER) | amd_gpu.EVENT_INDEX(0))
self.pkt3(amd_gpu.PACKET3_EVENT_WRITE, amd_gpu.EVENT_TYPE(amd_gpu.CS_PARTIAL_FLUSH) | amd_gpu.EVENT_INDEX(EVENT_INDEX_PARTIAL_FLUSH))
return self
@@ -268,7 +373,10 @@ class AMDProgram(HCQProgram):
self.enable_private_segment_sgpr: int = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
additional_alloc_sz = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t) if self.enable_dispatch_ptr else 0
super().__init__(CLikeArgsState, self.dev, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz)
if dev.sqtt_enabled: self.libhash: tuple[int, int] = struct.unpack('<Q', hashlib.md5(self.lib).digest()[:8])*2
super().__init__(CLikeArgsState, self.dev, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz, lib=self.lib,
base=self.lib_gpu.va_addr)
def __del__(self):
if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True, nolru=True))
@@ -285,6 +393,9 @@ class AMDAllocator(HCQAllocator['AMDDevice']):
MAP_FIXED, MAP_NORESERVE, MAP_LOCKED = 0x10, 0x400, 0 if OSX else 0x2000
# Immutable profiling record holding the raw SQTT trace of one shader engine: `se` is the engine index, `blob` the trace bytes,
# and `itrace` tells whether instruction tracing was enabled for that engine.
@dataclass(frozen=True)
class ProfileSQTTEvent(ProfileEvent): device:str; se:int; blob:bytes; itrace:bool # noqa: E702
@dataclass
class AMDQueueDesc:
ring: memoryview
@@ -600,6 +711,21 @@ class AMDDevice(HCQCompiled):
self.max_private_segment_size = 0
self._ensure_has_local_memory(128) # set default scratch size to 128 bytes per thread
# SQTT is disabled by default because of runtime overhead and big file sizes (~200mb to Tensor.full() two 4096x4096 tensors and matmul them)
self.sqtt_enabled = PROFILE and bool(getenv("SQTT", 0))
if self.sqtt_enabled:
if self.arch != 'gfx1100': raise RuntimeError('SQ Thread Tracing is only supported on 7900XTX')
if not self.driverless and (ppfeaturemask:=int(HWInterface('/sys/module/amdgpu/parameters/ppfeaturemask', os.O_RDONLY).read(), 16)) & 0x8000:
raise RuntimeError("SQTT can't be enabled because of hardware bug, to workaround either use driverless or add "
f"ppfeaturemask={(ppfeaturemask&~0x8000):#x} (current {ppfeaturemask=:#x} & ~PP_GFXOFF_MASK) to amdgpu module parameters\n"
"For more information read https://github.com/tinygrad/tinygrad/blob/master/extra/sqtt/README.md")
SQTT_BUFFER_SIZE = getenv("SQTT_BUFFER_SIZE", 256) # in mb, per shader engine
SQTT_NUM = self.dev_iface.props['array_count'] // self.dev_iface.props['simd_arrays_per_engine']
self.sqtt_buffers = [self.allocator.alloc(SQTT_BUFFER_SIZE*1024*1024, BufferSpec(cpu_access=True, nolru=True)) for _ in range(SQTT_NUM)]
self.sqtt_itrace_se_mask = getenv("SQTT_ITRACE_SE_MASK", 2) # -1 enable all, 0 disable all, >0 bitmask for where to enable instruction tracing
self.cmd_id = 0
AMDComputeQueue().start_trace(self.sqtt_buffers, self.sqtt_itrace_se_mask).submit(self)
def create_queue(self, queue_type, ring_size, ctx_save_restore_size=0, eop_buffer_size=0, ctl_stack_size=0, debug_memory_size=0):
ring = self.dev_iface.alloc(ring_size, uncached=True, cpu_access=True)
gart = self.dev_iface.alloc(0x1000, uncached=True, cpu_access=True)
@@ -629,6 +755,24 @@ class AMDDevice(HCQCompiled):
def on_device_hang(self):
  """Forward the hang notification to the device interface."""
  self.dev_iface.on_device_hang()
def _at_profile_finalize(self):
  """
  Stop SQ thread tracing (when enabled) and append one ProfileSQTTEvent per shader engine before the base-class finalization runs.

  The trace is stopped on the GPU, the per-engine write pointers are read back to learn how many bytes each trace buffer
  holds, and exactly that many bytes are copied out of every buffer. A warning is printed when a buffer looks full.
  """
  if self.sqtt_enabled:
    # stop_trace stores one 32-bit write pointer per shader engine at byte offset se*4, so reserve 4 bytes per trace buffer
    wptrs_buf = self.allocator.alloc(round_up(len(self.sqtt_buffers)*4, 0x1000), BufferSpec(cpu_access=True, nolru=True))
    wptrs = to_mv(wptrs_buf.va_addr, wptrs_buf.size)
    AMDComputeQueue().stop_trace(len(self.sqtt_buffers), wptrs_buf).signal(self.timeline_signal, self.timeline_value).submit(self)
    self.timeline_value += 1
    self.synchronize()
    if DEBUG>=2: print('Saving SQTT in profile...')
    for i,buf0 in enumerate(self.sqtt_buffers):
      # Both the register value and the buffer base are taken in 32-byte units (low 29 bits); their difference is the bytes written
      wptr = ((struct.unpack('<I', wptrs[i*4:i*4+4])[0] & 0x1FFFFFFF) - ((buf0.va_addr//32) & 0x1FFFFFFF)) * 32
      if DEBUG>=2: print(f'Se {i} blob size {wptr:#x}')
      assert wptr >= 0 and wptr <= buf0.size, f"{wptr} > {buf0.size}, should never happen"
      # When sqtt buffer overflows, wptr stops at the last dword
      if wptr >= buf0.size-32: print(f"WARNING: SQTT BUFFER IS FULL (SE {i})! INCREASE SQTT BUFFER SIZE WITH SQTT_BUFFER_SIZE=X (in MB)")
      self.allocator._copyout(sqtt_buf:=memoryview(bytearray(wptr)), buf0)
      Compiled.profile_events += [ProfileSQTTEvent(self.device, i, bytes(sqtt_buf), bool((self.sqtt_itrace_se_mask >> i) & 0b1))]
  super()._at_profile_finalize()
def finalize(self):
self.synchronize()
if hasattr(self.dev_iface, 'device_fini'): self.dev_iface.device_fini()

View File

@@ -3,7 +3,7 @@ from typing import cast, Type, TypeVar, Generic, Any, ClassVar
import contextlib, decimal, statistics, time, ctypes, array, os, fcntl
from tinygrad.helpers import PROFILE, from_mv, getenv, to_mv, round_up
from tinygrad.renderer import Renderer
from tinygrad.device import BufferSpec, Compiler, Compiled, LRUAllocator, ProfileRangeEvent, ProfileDeviceEvent
from tinygrad.device import BufferSpec, Compiler, Compiled, LRUAllocator, ProfileRangeEvent, ProfileDeviceEvent, ProfileProgramEvent
from tinygrad.ops import sym_infer, sint, Variable, UOp
from tinygrad.runtime.autogen import libc
@@ -290,8 +290,9 @@ class CLikeArgsState(HCQArgsState[ProgramType]):
self.bind_sints_to_ptr(*vals, ptr=self.ptr + len(prefix or []) * 4 + len(bufs) * 8, fmt='I')
class HCQProgram(Generic[DeviceType]):
def __init__(self, args_state_t:Type[HCQArgsState], dev:DeviceType, name:str, kernargs_alloc_size:int):
def __init__(self, args_state_t:Type[HCQArgsState], dev:DeviceType, name:str, kernargs_alloc_size:int, lib:bytes|None=None, base:int|None=None):
self.args_state_t, self.dev, self.name, self.kernargs_alloc_size = args_state_t, dev, name, kernargs_alloc_size
if PROFILE: Compiled.profile_events += [ProfileProgramEvent(dev.device, name, lib, base)]
def fill_kernargs(self, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=(), kernargs_ptr:int|None=None) -> HCQArgsState:
"""