mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-13 00:15:35 +08:00
SQTT profiling (#9278)
* sqtt * docs * multi-device * ProfileSQTTEvent * exec update * 256mb default * don't let people hang their gpus * bitfields from autogen * asic info from mesa * more bitfields from autogen * SQTT_ITRACE_SE_MASK --------- Co-authored-by: George Hotz <72895+geohot@users.noreply.github.com>
This commit is contained in:
3
.github/workflows/test.yml
vendored
3
.github/workflows/test.yml
vendored
@@ -118,12 +118,15 @@ jobs:
|
||||
cp tinygrad/runtime/autogen/hsa.py /tmp/hsa.py.bak
|
||||
cp tinygrad/runtime/autogen/comgr.py /tmp/comgr.py.bak
|
||||
cp tinygrad/runtime/autogen/amd_gpu.py /tmp/amd_gpu.py.bak
|
||||
cp tinygrad/runtime/autogen/sqtt.py /tmp/sqtt.py.bak
|
||||
./autogen_stubs.sh hsa
|
||||
./autogen_stubs.sh comgr
|
||||
./autogen_stubs.sh amd
|
||||
./autogen_stubs.sh sqtt
|
||||
diff /tmp/hsa.py.bak tinygrad/runtime/autogen/hsa.py
|
||||
diff /tmp/comgr.py.bak tinygrad/runtime/autogen/comgr.py
|
||||
diff /tmp/amd_gpu.py.bak tinygrad/runtime/autogen/amd_gpu.py
|
||||
diff /tmp/sqtt.py.bak tinygrad/runtime/autogen/sqtt.py
|
||||
- name: Verify Linux autogen
|
||||
run: |
|
||||
cp tinygrad/runtime/autogen/libc.py /tmp/libc.py.bak
|
||||
|
||||
@@ -362,6 +362,16 @@ generate_am() {
|
||||
fixup $BASE/am/hdp_6_0_0.py
|
||||
}
|
||||
|
||||
generate_sqtt() {
  # Regenerate the ctypes bindings in $BASE/sqtt.py from the C header extra/sqtt/sqtt.h.
  clang2py -k cdefstum extra/sqtt/sqtt.h -o $BASE/sqtt.py

  # Post-process the generated file with the script's shared `fixup` helper (defined elsewhere in this script).
  fixup $BASE/sqtt.py
  # Extend the generated `import ctypes` line so that `os` is imported as well.
  sed -i "s\import ctypes\import ctypes, os\g" $BASE/sqtt.py
  # Smoke test: the regenerated module must at least import.
  python3 -c "import tinygrad.runtime.autogen.sqtt"
}
|
||||
|
||||
generate_webgpu() {
|
||||
clang2py -l /usr/local/lib/libwebgpu_dawn.so extra/webgpu/webgpu.h -o $BASE/webgpu.py
|
||||
fixup $BASE/webgpu.py
|
||||
@@ -380,6 +390,7 @@ elif [ "$1" == "kfd" ]; then generate_kfd
|
||||
elif [ "$1" == "nv" ]; then generate_nv
|
||||
elif [ "$1" == "amd" ]; then generate_amd
|
||||
elif [ "$1" == "am" ]; then generate_am
|
||||
elif [ "$1" == "sqtt" ]; then generate_sqtt
|
||||
elif [ "$1" == "qcom" ]; then generate_qcom
|
||||
elif [ "$1" == "io_uring" ]; then generate_io_uring
|
||||
elif [ "$1" == "libc" ]; then generate_libc
|
||||
|
||||
33
extra/sqtt/README.md
Normal file
33
extra/sqtt/README.md
Normal file
@@ -0,0 +1,33 @@
|
||||
# SQTT Profiling
|
||||
|
||||
## Getting SQ Thread Trace
|
||||
|
||||
Only supported on 7900XTX, requires either AM (`rmmod amdgpu`) or disabling power gating on AMD (`ppfeaturemask=0xffff3fff`, don't forget to rebuild initramfs)
|
||||
|
||||
SQTT is implemented on top of normal tinygrad PROFILE=1, `PROFILE=1 SQTT=1` to get profile pickle with sqtt data embedded in it.
|
||||
|
||||
`SQTT_BUFFER_SIZE=X` to change size of SQTT buffer (per shader engine, 6 SEs on 7900xtx) in megabytes, default 256.
|
||||
|
||||
`SQTT_ITRACE_SE_MASK=X` to select for which shader engines instruction tracing will be enabled, -1 is all, 0 is none (instruction tracing disabled), >0 is
|
||||
bitfield/mask for SEs to enable instruction tracing on. Masking shader engines will give smaller file sizes at a cost of fewer hits, and kernels that
|
||||
don't have any wavefront on the first SIMD of a shader engine with instruction tracing enabled will not have instruction timings.
|
||||
The default is 2 (second shader engine only), only one for file size reasons, second instead of first because dispatch starts from it so there is
|
||||
greater chance that kernels with small global size will have instruction tracing data.
|
||||
|
||||
Note that instruction tracing might not be available for kernels with small global dims, this is not a bug, but it can be improved with various hacks
|
||||
to the point where it can reliably trace a kernel consisting of a single wavefront (am only, not quite reliable under amdgpu due to waves sometimes
|
||||
being dispatched starting from different simds). More info in comments in ops_amd.py
|
||||
|
||||
## Converting pickled profile with SQTT data into RGP file
|
||||
|
||||
```bash
|
||||
extra/sqtt/rgptool.py create "/tmp/profile.pkl.$USER" -o /tmp/gpu0.rgp
|
||||
```
|
||||
|
||||
Then load gpu0.rgp into Radeon GPU Profiler. It works just fine both in wine (macos, native version available for linux) and via ssh X forwarding
|
||||
|
||||
If multiple GPUs are used, you can select which one to export with `-d`, like this:
|
||||
|
||||
```bash
|
||||
extra/sqtt/rgptool.py create "/tmp/profile.pkl.$USER" -d 'AMD:5' -o /tmp/gpu5.rgp
|
||||
```
|
||||
330
extra/sqtt/rgptool.py
Executable file
330
extra/sqtt/rgptool.py
Executable file
@@ -0,0 +1,330 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
import argparse, ctypes, struct, hashlib, pickle, code, typing, functools
|
||||
import tinygrad.runtime.autogen.sqtt as sqtt
|
||||
from tinygrad.device import ProfileEvent, ProfileDeviceEvent, ProfileProgramEvent
|
||||
from tinygrad.runtime.ops_amd import ProfileSQTTEvent
|
||||
from tinygrad.helpers import round_up, flatten, all_same
|
||||
from dataclasses import dataclass
|
||||
|
||||
CHUNK_CLASSES = {
|
||||
sqtt.SQTT_FILE_CHUNK_TYPE_ASIC_INFO: sqtt.struct_sqtt_file_chunk_asic_info,
|
||||
sqtt.SQTT_FILE_CHUNK_TYPE_SQTT_DESC: sqtt.struct_sqtt_file_chunk_sqtt_desc,
|
||||
sqtt.SQTT_FILE_CHUNK_TYPE_SQTT_DATA: sqtt.struct_sqtt_file_chunk_sqtt_data,
|
||||
sqtt.SQTT_FILE_CHUNK_TYPE_API_INFO: sqtt.struct_sqtt_file_chunk_api_info,
|
||||
sqtt.SQTT_FILE_CHUNK_TYPE_QUEUE_EVENT_TIMINGS: sqtt.struct_sqtt_file_chunk_queue_event_timings,
|
||||
sqtt.SQTT_FILE_CHUNK_TYPE_CLOCK_CALIBRATION: sqtt.struct_sqtt_file_chunk_clock_calibration,
|
||||
sqtt.SQTT_FILE_CHUNK_TYPE_CPU_INFO: sqtt.struct_sqtt_file_chunk_cpu_info,
|
||||
sqtt.SQTT_FILE_CHUNK_TYPE_SPM_DB: sqtt.struct_sqtt_file_chunk_spm_db,
|
||||
sqtt.SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_DATABASE: sqtt.struct_sqtt_file_chunk_code_object_database,
|
||||
sqtt.SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_LOADER_EVENTS: sqtt.struct_sqtt_file_chunk_code_object_loader_events,
|
||||
sqtt.SQTT_FILE_CHUNK_TYPE_PSO_CORRELATION: sqtt.struct_sqtt_file_chunk_pso_correlation,
|
||||
}
|
||||
|
||||
def pretty(val, pad=0) -> str:
  """Render a ctypes value as an indented, human-readable string.

  Structures and unions are printed as ``Name(`` followed by one ``field=value`` entry per line
  (indented two spaces deeper than ``pad``) and a closing ``)`` at column ``pad``. Arrays are
  printed inline as ``[a, b, ...]``. Integers >= 1024 are shown in hex; everything else uses ``repr``.

  Args:
    val: the value to render (ctypes Structure/Union/Array or any plain Python value).
    pad: number of spaces the enclosing scope is already indented by.
  """
  if isinstance(val, (ctypes.Structure, ctypes.Union)):
    inner = ' ' * (pad + 2)
    # _fields_ entries are (name, type) or (name, type, bits) for bitfields; only the name is needed.
    # getattr with a default keeps field-less Structure/Union subclasses from raising AttributeError.
    fields = [f"{name}={pretty(getattr(val, name), pad=pad+2)}" for name, *_ in getattr(val, '_fields_', ())]
    return f"{val.__class__.__name__}(\n{inner}" + f", \n{inner}".join(fields) + f"\n{' '*pad})"
  if isinstance(val, ctypes.Array):
    # propagate the current indentation so structures stored inside arrays line up with their parent
    return f"[{', '.join(pretty(item, pad) for item in val)}]"
  if isinstance(val, int) and val >= 1024: return hex(val)
  return repr(val)
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class RGPChunk:
|
||||
header: sqtt.Structure
|
||||
data: list[typing.Any]|list[tuple[typing.Any, bytes]]|bytes|None = None
|
||||
def print(self):
|
||||
print(pretty(self.header))
|
||||
# if isinstance(self.data, bytes): print(repr(self.data))
|
||||
if isinstance(self.data, list):
|
||||
for dchunk in self.data:
|
||||
if isinstance(dchunk, tuple):
|
||||
print(pretty(dchunk[0]))
|
||||
# print(repr(dchunk[1]))
|
||||
else:
|
||||
print(pretty(dchunk))
|
||||
# TODO: `def fixup` and true immutability
|
||||
def to_bytes(self, offset:int) -> bytes:
|
||||
cid = self.header.header.chunk_id.type
|
||||
match cid:
|
||||
case _ if cid in {sqtt.SQTT_FILE_CHUNK_TYPE_ASIC_INFO, sqtt.SQTT_FILE_CHUNK_TYPE_CPU_INFO, sqtt.SQTT_FILE_CHUNK_TYPE_API_INFO, sqtt.SQTT_FILE_CHUNK_TYPE_SQTT_DESC}:
|
||||
self.header.header.size_in_bytes = ctypes.sizeof(self.header)
|
||||
return bytes(self.header)
|
||||
case sqtt.SQTT_FILE_CHUNK_TYPE_SQTT_DATA:
|
||||
assert isinstance(self.data, bytes)
|
||||
self.header.header.size_in_bytes = ctypes.sizeof(self.header) + len(self.data)
|
||||
self.header.offset = offset+ctypes.sizeof(self.header)
|
||||
self.header.size = len(self.data)
|
||||
return bytes(self.header) + self.data
|
||||
case sqtt.SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_DATABASE:
|
||||
assert isinstance(self.data, list)
|
||||
data_codb = typing.cast(list[tuple[sqtt.struct_sqtt_code_object_database_record, bytes]], self.data)
|
||||
ret = bytearray()
|
||||
sz = ctypes.sizeof(self.header)+sum([ctypes.sizeof(record_hdr)+round_up(len(record_blob), 4) for record_hdr,record_blob in data_codb])
|
||||
self.header.header.size_in_bytes = sz
|
||||
self.header.offset = offset
|
||||
self.header.record_count = len(data_codb)
|
||||
self.header.size = sz
|
||||
ret += self.header
|
||||
for record_hdr,record_blob in data_codb:
|
||||
record_hdr.size = round_up(len(record_blob), 4)
|
||||
ret += record_hdr
|
||||
ret += record_blob.ljust(4, b'\x00')
|
||||
return ret
|
||||
case sqtt.SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_LOADER_EVENTS:
|
||||
assert isinstance(self.data, list)
|
||||
data_lev = typing.cast(list[tuple[sqtt.struct_sqtt_code_object_loader_events_record]], self.data)
|
||||
self.header.header.size_in_bytes = ctypes.sizeof(self.header)+ctypes.sizeof(sqtt.struct_sqtt_code_object_loader_events_record)*len(data_lev)
|
||||
self.header.offset = offset
|
||||
self.header.record_size = ctypes.sizeof(sqtt.struct_sqtt_code_object_loader_events_record)
|
||||
self.header.record_count = len(data_lev)
|
||||
return bytes(self.header) + b''.join(map(bytes, data_lev))
|
||||
case sqtt.SQTT_FILE_CHUNK_TYPE_PSO_CORRELATION:
|
||||
assert isinstance(self.data, list)
|
||||
data_pso = typing.cast(list[tuple[sqtt.struct_sqtt_pso_correlation_record]], self.data)
|
||||
self.header.header.size_in_bytes = ctypes.sizeof(self.header)+ctypes.sizeof(sqtt.struct_sqtt_pso_correlation_record)*len(data_pso)
|
||||
self.header.offset = offset
|
||||
self.header.record_size = ctypes.sizeof(sqtt.struct_sqtt_pso_correlation_record)
|
||||
self.header.record_count = len(data_pso)
|
||||
return bytes(self.header) + b''.join(map(bytes, data_pso))
|
||||
case _: raise NotImplementedError(pretty(self.header))
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class RGP:
|
||||
header: sqtt.struct_sqtt_file_header
|
||||
chunks: list[RGPChunk]
|
||||
@staticmethod
|
||||
def from_bytes(blob: bytes) -> RGP:
|
||||
file_header = sqtt.struct_sqtt_file_header.from_buffer_copy(blob)
|
||||
assert file_header.magic_number == sqtt.SQTT_FILE_MAGIC_NUMBER and file_header.version_major == sqtt.SQTT_FILE_VERSION_MAJOR
|
||||
i = file_header.chunk_offset
|
||||
chunks = []
|
||||
while i < len(blob):
|
||||
assert i%4==0, hex(i)
|
||||
hdr = sqtt.struct_sqtt_file_chunk_header.from_buffer_copy(blob, i)
|
||||
cid = hdr.chunk_id.type
|
||||
header: ctypes.Structure
|
||||
match cid:
|
||||
case _ if cid in {sqtt.SQTT_FILE_CHUNK_TYPE_RESERVED, sqtt.SQTT_FILE_CHUNK_TYPE_QUEUE_EVENT_TIMINGS, sqtt.SQTT_FILE_CHUNK_TYPE_CLOCK_CALIBRATION, sqtt.SQTT_FILE_CHUNK_TYPE_SPM_DB}:
|
||||
chunk = None
|
||||
case sqtt.SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_DATABASE:
|
||||
header = sqtt.struct_sqtt_file_chunk_code_object_database.from_buffer_copy(blob, i)
|
||||
j = header.offset + ctypes.sizeof(header)
|
||||
data: list = []
|
||||
while j < header.offset + header.size:
|
||||
rec_hdr: ctypes.Structure = sqtt.struct_sqtt_code_object_database_record.from_buffer_copy(blob, j)
|
||||
data.append((rec_hdr, elf:=blob[j+ctypes.sizeof(rec_hdr):j+ctypes.sizeof(rec_hdr)+rec_hdr.size]))
|
||||
assert elf[:4] == b'\x7fELF', repr(elf[:16])
|
||||
j += ctypes.sizeof(rec_hdr)+rec_hdr.size
|
||||
assert len(data) == header.record_count
|
||||
chunk = RGPChunk(header, data)
|
||||
case sqtt.SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_LOADER_EVENTS:
|
||||
header = sqtt.struct_sqtt_file_chunk_code_object_loader_events.from_buffer_copy(blob, i)
|
||||
data = [sqtt.struct_sqtt_code_object_loader_events_record.from_buffer_copy(blob, header.offset+ctypes.sizeof(header)+j*header.record_size)
|
||||
for j in range(header.record_count)]
|
||||
chunk = RGPChunk(header, data)
|
||||
case sqtt.SQTT_FILE_CHUNK_TYPE_PSO_CORRELATION:
|
||||
header = sqtt.struct_sqtt_file_chunk_pso_correlation.from_buffer_copy(blob, i)
|
||||
data = [sqtt.struct_sqtt_pso_correlation_record.from_buffer_copy(blob, header.offset+ctypes.sizeof(header)+j*header.record_size)
|
||||
for j in range(header.record_count)]
|
||||
chunk = RGPChunk(header, data)
|
||||
case sqtt.SQTT_FILE_CHUNK_TYPE_SQTT_DATA:
|
||||
header = sqtt.struct_sqtt_file_chunk_sqtt_data.from_buffer_copy(blob, i)
|
||||
chunk = RGPChunk(header, blob[header.offset:header.offset+header.size])
|
||||
case _ if cid in {sqtt.SQTT_FILE_CHUNK_TYPE_ASIC_INFO, sqtt.SQTT_FILE_CHUNK_TYPE_CPU_INFO, sqtt.SQTT_FILE_CHUNK_TYPE_API_INFO,
|
||||
sqtt.SQTT_FILE_CHUNK_TYPE_SQTT_DESC}:
|
||||
chunk = RGPChunk(CHUNK_CLASSES[cid].from_buffer_copy(blob, i))
|
||||
case _:
|
||||
chunk = None
|
||||
print(f"unknown chunk id {cid}")
|
||||
if chunk is not None: chunks.append(chunk)
|
||||
i += hdr.size_in_bytes
|
||||
assert i == len(blob), f'{i} != {len(blob)}'
|
||||
return RGP(file_header, chunks)
|
||||
@staticmethod
|
||||
def from_profile(profile_pickled, device:str|None=None):
|
||||
profile: list[ProfileEvent] = pickle.loads(profile_pickled)
|
||||
device_events = {x.device:x for x in profile if isinstance(x, ProfileDeviceEvent) and x.device.startswith('AMD')}
|
||||
if device is None:
|
||||
if len(device_events) == 0: raise RuntimeError('No supported devices found in profile')
|
||||
if len(device_events) > 1: raise RuntimeError(f"More than one supported device found, select which one to export: {', '.join(device_events.keys())}")
|
||||
_, device_event = device_events.popitem()
|
||||
else:
|
||||
if device not in device_events: raise RuntimeError(f"Device {device} not found in profile, devices in profile: {', '.join(device_events.keys())} ")
|
||||
device_event = device_events[device]
|
||||
sqtt_events = [x for x in profile if isinstance(x, ProfileSQTTEvent) and x.device == device_event.device]
|
||||
if len(sqtt_events) == 0: raise RuntimeError(f"Device {device_event.device} doesn't contain SQTT data")
|
||||
sqtt_itrace_enabled = any([event.itrace for event in sqtt_events])
|
||||
sqtt_itrace_masked = not all_same([event.itrace for event in sqtt_events])
|
||||
sqtt_itrace_se_mask = functools.reduce(lambda a,b: a|b, [int(event.itrace) << event.se for event in sqtt_events], 0) if sqtt_itrace_masked else 0
|
||||
load_events = [x for x in profile if isinstance(x, ProfileProgramEvent) and x.device == device_event.device]
|
||||
loads = [(event.base, struct.unpack('<Q', hashlib.md5(event.lib).digest()[:8])*2) for event in load_events if event.base is not None and event.lib is not None]
|
||||
code_objects = list(dict.fromkeys([x.lib for x in load_events if x.lib is not None]).keys())
|
||||
if len(loads) == 0: raise RuntimeError('No load events in profile')
|
||||
# TODO: tons of stuff hardcoded for 7900xtx
|
||||
file_header = sqtt.struct_sqtt_file_header(
|
||||
magic_number=sqtt.SQTT_FILE_MAGIC_NUMBER,
|
||||
version_major=sqtt.SQTT_FILE_VERSION_MAJOR,
|
||||
version_minor=sqtt.SQTT_FILE_VERSION_MINOR,
|
||||
flags=sqtt.struct_sqtt_file_header_flags(
|
||||
_0=sqtt.union_sqtt_file_header_flags_0(value=1),
|
||||
),
|
||||
chunk_offset=ctypes.sizeof(sqtt.struct_sqtt_file_header),
|
||||
)
|
||||
chunks = [
|
||||
RGPChunk(sqtt.struct_sqtt_file_chunk_cpu_info(
|
||||
header=sqtt.struct_sqtt_file_chunk_header(
|
||||
chunk_id=sqtt.struct_sqtt_file_chunk_id(type=sqtt.SQTT_FILE_CHUNK_TYPE_CPU_INFO),
|
||||
major_version=0, minor_version=0,
|
||||
),
|
||||
cpu_timestamp_freq=1000000000,
|
||||
clock_speed=2994, # in mhz???
|
||||
num_logical_cores=64,
|
||||
num_physical_cores=32,
|
||||
system_ram_size=256*1024, # in mb???
|
||||
)),
|
||||
RGPChunk(sqtt.struct_sqtt_file_chunk_asic_info(
|
||||
header=sqtt.struct_sqtt_file_chunk_header(
|
||||
chunk_id=sqtt.struct_sqtt_file_chunk_id(type=sqtt.SQTT_FILE_CHUNK_TYPE_ASIC_INFO),
|
||||
major_version=0, minor_version=5,
|
||||
),
|
||||
flags=0,
|
||||
trace_shader_core_clock=0x93f05080,
|
||||
trace_memory_clock=0x4a723a40,
|
||||
device_id=0x744c,
|
||||
device_revision_id=0xc8,
|
||||
vgprs_per_simd=1536,
|
||||
sgprs_per_simd=128*16,
|
||||
shader_engines=6,
|
||||
compute_unit_per_shader_engine=16,
|
||||
simd_per_compute_unit=2,
|
||||
wavefronts_per_simd=16,
|
||||
minimum_vgpr_alloc=4,
|
||||
vgpr_alloc_granularity=8,
|
||||
minimum_sgpr_alloc=128,
|
||||
sgpr_alloc_granularity=128,
|
||||
hardware_contexts=8,
|
||||
gpu_type=sqtt.SQTT_GPU_TYPE_DISCRETE,
|
||||
gfxip_level=sqtt.SQTT_GFXIP_LEVEL_GFXIP_11_0,
|
||||
gpu_index=0,
|
||||
gds_size=0,
|
||||
gds_per_shader_engine=0,
|
||||
ce_ram_size=0,
|
||||
ce_ram_size_graphics=0,
|
||||
ce_ram_size_compute=0,
|
||||
max_number_of_dedicated_cus=0,
|
||||
vram_size=24 * 1024 * 1024 * 1024, # 24 GB
|
||||
vram_bus_width=384, # 384-bit
|
||||
l2_cache_size=6 * 1024 * 1024, # 6 MB
|
||||
l1_cache_size=32 * 1024, # 32 KB per SIMD (?)
|
||||
lds_size=65536, # 64 KB per CU
|
||||
gpu_name=b'NAVI31',
|
||||
alu_per_clock=0,
|
||||
texture_per_clock=0,
|
||||
prims_per_clock=6,
|
||||
pixels_per_clock=0,
|
||||
gpu_timestamp_frequency=100000000, # 100 MHz
|
||||
max_shader_core_clock=2500000000, # 2.5 GHz (boost clock)
|
||||
max_memory_clock=1250000000, # 1.25 GHz
|
||||
memory_ops_per_clock=16,
|
||||
memory_chip_type=sqtt.SQTT_MEMORY_TYPE_GDDR6,
|
||||
lds_granularity=512,
|
||||
cu_mask=((255, 255),)*6 + ((0,0),)*(32-6),
|
||||
gl1_cache_size=256 * 1024, # 256 KB
|
||||
instruction_cache_size=32 * 1024, # 32 KB
|
||||
scalar_cache_size=16 * 1024, # 16 KB
|
||||
mall_cache_size=96 * 1024 * 1024, # 96 MB
|
||||
)),
|
||||
RGPChunk(sqtt.struct_sqtt_file_chunk_api_info(
|
||||
header=sqtt.struct_sqtt_file_chunk_header(
|
||||
chunk_id=sqtt.struct_sqtt_file_chunk_id(type=sqtt.SQTT_FILE_CHUNK_TYPE_API_INFO),
|
||||
major_version=0,
|
||||
minor_version=2,
|
||||
),
|
||||
api_type=5, # HIP, not in enum
|
||||
major_version=12, minor_version=0,
|
||||
profiling_mode=sqtt.SQTT_PROFILING_MODE_PRESENT,
|
||||
instruction_trace_mode=sqtt.SQTT_INSTRUCTION_TRACE_FULL_FRAME if sqtt_itrace_enabled else sqtt.SQTT_INSTRUCTION_TRACE_DISABLED,
|
||||
instruction_trace_data=sqtt.union_sqtt_instruction_trace_data(
|
||||
shader_engine_filter=sqtt.struct_sqtt_instruction_trace_data_shader_engine_filter(mask=sqtt_itrace_se_mask),
|
||||
),
|
||||
)),
|
||||
*flatten([(
|
||||
RGPChunk(sqtt.struct_sqtt_file_chunk_sqtt_desc(
|
||||
header=sqtt.struct_sqtt_file_chunk_header(
|
||||
chunk_id=sqtt.struct_sqtt_file_chunk_id(type=sqtt.SQTT_FILE_CHUNK_TYPE_SQTT_DESC, index=sqtt_event.se),
|
||||
major_version=0, minor_version=2,
|
||||
),
|
||||
shader_engine_index=sqtt_event.se,
|
||||
sqtt_version=sqtt.SQTT_VERSION_3_2,
|
||||
_0=sqtt.union_sqtt_file_chunk_sqtt_desc_0(
|
||||
v1=sqtt.struct_sqtt_file_chunk_sqtt_desc_0_v1(
|
||||
instrumentation_spec_version=1,
|
||||
instrumentation_api_version=0,
|
||||
compute_unit_index=0,
|
||||
)
|
||||
),
|
||||
)),
|
||||
RGPChunk(sqtt.struct_sqtt_file_chunk_sqtt_data(
|
||||
header=sqtt.struct_sqtt_file_chunk_header(
|
||||
chunk_id=sqtt.struct_sqtt_file_chunk_id(type=sqtt.SQTT_FILE_CHUNK_TYPE_SQTT_DATA, index=sqtt_event.se),
|
||||
major_version=0, minor_version=0,
|
||||
),
|
||||
), sqtt_event.blob),
|
||||
) for sqtt_event in sqtt_events]),
|
||||
RGPChunk(sqtt.struct_sqtt_file_chunk_code_object_database(
|
||||
header=sqtt.struct_sqtt_file_chunk_header(
|
||||
chunk_id=sqtt.struct_sqtt_file_chunk_id(type=sqtt.SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_DATABASE),
|
||||
major_version=0, minor_version=0,
|
||||
),
|
||||
), [(sqtt.struct_sqtt_code_object_database_record(), lib) for lib in code_objects]),
|
||||
RGPChunk(sqtt.struct_sqtt_file_chunk_code_object_loader_events(
|
||||
header=sqtt.struct_sqtt_file_chunk_header(
|
||||
chunk_id=sqtt.struct_sqtt_file_chunk_id(type=sqtt.SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_LOADER_EVENTS),
|
||||
major_version=1, minor_version=0,
|
||||
),
|
||||
), [sqtt.struct_sqtt_code_object_loader_events_record(base_address=base, code_object_hash=hash) for base,hash in loads]),
|
||||
RGPChunk(sqtt.struct_sqtt_file_chunk_pso_correlation(
|
||||
header=sqtt.struct_sqtt_file_chunk_header(
|
||||
chunk_id=sqtt.struct_sqtt_file_chunk_id(type=sqtt.SQTT_FILE_CHUNK_TYPE_PSO_CORRELATION),
|
||||
major_version=0, minor_version=0,
|
||||
),
|
||||
), [sqtt.struct_sqtt_pso_correlation_record(api_pso_hash=hash[0], pipeline_hash=hash) for _,hash in loads])
|
||||
]
|
||||
return RGP(file_header, chunks)
|
||||
def to_bytes(self) -> bytes:
|
||||
ret = bytearray()
|
||||
ret += self.header
|
||||
for chunk in self.chunks:
|
||||
ret += chunk.to_bytes(len(ret))
|
||||
return bytes(ret)
|
||||
def print(self):
|
||||
print(pretty(self.header))
|
||||
for chunk in self.chunks: chunk.print()
|
||||
|
||||
if __name__ == '__main__':
  # CLI: `print` dumps an .rgp file, `create` converts a pickled profile, `repl` opens an interactive
  # session with the parsed file bound to `rgp`. With -o/--output the resulting RGP is written back out.
  parser = argparse.ArgumentParser(prog='rgptool', description='A tool to create (from pickled tinygrad profile), inspect and modify Radeon GPU Profiler files')
  parser.add_argument('command')
  parser.add_argument('input')
  parser.add_argument('-d', '--device')
  parser.add_argument('-o', '--output')
  args = parser.parse_args()

  # the input is read before the command is validated
  with open(args.input, 'rb') as fd: input_bytes = fd.read()

  if args.command == 'print':
    rgp = RGP.from_bytes(input_bytes)
    rgp.print()
  elif args.command == 'create':
    rgp = RGP.from_profile(input_bytes, device=args.device)
  elif args.command == 'repl':
    rgp = RGP.from_bytes(input_bytes)
    code.interact(local=locals())
  else:
    raise RuntimeError(args.command)

  if args.output is not None:
    with open(args.output, 'wb+') as fd: fd.write(rgp.to_bytes())
|
||||
840
extra/sqtt/sqtt.h
Normal file
840
extra/sqtt/sqtt.h
Normal file
@@ -0,0 +1,840 @@
|
||||
#include <stdint.h>
|
||||
|
||||
// Original definition in pal is in c++ and clang2py can't autogen it correctly
|
||||
// Most of this is copy pasted from mesa/src/amd/common/ac_rgp.{h, c}
|
||||
|
||||
/*
|
||||
* Copyright 2020 Advanced Micro Devices, Inc.
|
||||
* Copyright 2020 Valve Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#define SQTT_FILE_MAGIC_NUMBER 0x50303042
|
||||
#define SQTT_FILE_VERSION_MAJOR 1
|
||||
#define SQTT_FILE_VERSION_MINOR 5
|
||||
|
||||
#define SQTT_GPU_NAME_MAX_SIZE 256
|
||||
#define SQTT_MAX_NUM_SE 32
|
||||
#define SQTT_SA_PER_SE 2
|
||||
#define SQTT_ACTIVE_PIXEL_PACKER_MASK_DWORDS 4
|
||||
|
||||
struct sqtt_data_info {
|
||||
uint32_t cur_offset;
|
||||
uint32_t trace_status;
|
||||
union {
|
||||
uint32_t gfx9_write_counter;
|
||||
uint32_t gfx10_dropped_cntr;
|
||||
};
|
||||
};
|
||||
|
||||
struct sqtt_data_se {
|
||||
struct sqtt_data_info info;
|
||||
void *data_ptr;
|
||||
uint32_t shader_engine;
|
||||
uint32_t compute_unit;
|
||||
};
|
||||
|
||||
|
||||
enum sqtt_version
|
||||
{
|
||||
SQTT_VERSION_NONE = 0x0,
|
||||
SQTT_VERSION_2_2 = 0x5, /* GFX8 */
|
||||
SQTT_VERSION_2_3 = 0x6, /* GFX9 */
|
||||
SQTT_VERSION_2_4 = 0x7, /* GFX10+ */
|
||||
SQTT_VERSION_3_2 = 0xb, /* GFX11+ */
|
||||
};
|
||||
|
||||
enum sqtt_file_chunk_type
|
||||
{
|
||||
SQTT_FILE_CHUNK_TYPE_ASIC_INFO,
|
||||
SQTT_FILE_CHUNK_TYPE_SQTT_DESC,
|
||||
SQTT_FILE_CHUNK_TYPE_SQTT_DATA,
|
||||
SQTT_FILE_CHUNK_TYPE_API_INFO,
|
||||
SQTT_FILE_CHUNK_TYPE_RESERVED,
|
||||
SQTT_FILE_CHUNK_TYPE_QUEUE_EVENT_TIMINGS,
|
||||
SQTT_FILE_CHUNK_TYPE_CLOCK_CALIBRATION,
|
||||
SQTT_FILE_CHUNK_TYPE_CPU_INFO,
|
||||
SQTT_FILE_CHUNK_TYPE_SPM_DB,
|
||||
SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_DATABASE,
|
||||
SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_LOADER_EVENTS,
|
||||
SQTT_FILE_CHUNK_TYPE_PSO_CORRELATION,
|
||||
SQTT_FILE_CHUNK_TYPE_INSTRUMENTATION_TABLE,
|
||||
SQTT_FILE_CHUNK_TYPE_COUNT
|
||||
};
|
||||
|
||||
|
||||
struct sqtt_file_chunk_id {
|
||||
int32_t type : 8;
|
||||
int32_t index : 8;
|
||||
int32_t reserved : 16;
|
||||
};
|
||||
|
||||
/* Common header; it is the first member of every sqtt_file_chunk_* struct in this file. */
struct sqtt_file_chunk_header {
|
||||
struct sqtt_file_chunk_id chunk_id; /* chunk type and index */
|
||||
uint16_t minor_version;
|
||||
uint16_t major_version;
|
||||
int32_t size_in_bytes; /* whole chunk, header included: extra/sqtt/rgptool.py advances to the next chunk by this amount */
|
||||
int32_t padding;
|
||||
};
|
||||
|
||||
struct sqtt_file_header_flags {
|
||||
union {
|
||||
struct {
|
||||
uint32_t is_semaphore_queue_timing_etw : 1;
|
||||
uint32_t no_queue_semaphore_timestamps : 1;
|
||||
uint32_t reserved : 30;
|
||||
};
|
||||
|
||||
uint32_t value;
|
||||
};
|
||||
};
|
||||
|
||||
struct sqtt_file_header {
|
||||
uint32_t magic_number;
|
||||
uint32_t version_major;
|
||||
uint32_t version_minor;
|
||||
struct sqtt_file_header_flags flags;
|
||||
int32_t chunk_offset;
|
||||
int32_t second;
|
||||
int32_t minute;
|
||||
int32_t hour;
|
||||
int32_t day_in_month;
|
||||
int32_t month;
|
||||
int32_t year;
|
||||
int32_t day_in_week;
|
||||
int32_t day_in_year;
|
||||
int32_t is_daylight_savings;
|
||||
};
|
||||
|
||||
struct sqtt_file_chunk_cpu_info {
|
||||
struct sqtt_file_chunk_header header;
|
||||
uint32_t vendor_id[4];
|
||||
uint32_t processor_brand[12];
|
||||
uint32_t reserved[2];
|
||||
uint64_t cpu_timestamp_freq;
|
||||
uint32_t clock_speed;
|
||||
uint32_t num_logical_cores;
|
||||
uint32_t num_physical_cores;
|
||||
uint32_t system_ram_size;
|
||||
};
|
||||
|
||||
enum sqtt_file_chunk_asic_info_flags
|
||||
{
|
||||
SQTT_FILE_CHUNK_ASIC_INFO_FLAG_SC_PACKER_NUMBERING = (1 << 0),
|
||||
SQTT_FILE_CHUNK_ASIC_INFO_FLAG_PS1_EVENT_TOKENS_ENABLED = (1 << 1)
|
||||
};
|
||||
|
||||
enum sqtt_gpu_type
|
||||
{
|
||||
SQTT_GPU_TYPE_UNKNOWN = 0x0,
|
||||
SQTT_GPU_TYPE_INTEGRATED = 0x1,
|
||||
SQTT_GPU_TYPE_DISCRETE = 0x2,
|
||||
SQTT_GPU_TYPE_VIRTUAL = 0x3
|
||||
};
|
||||
|
||||
enum sqtt_gfxip_level
|
||||
{
|
||||
SQTT_GFXIP_LEVEL_NONE = 0x0,
|
||||
SQTT_GFXIP_LEVEL_GFXIP_6 = 0x1,
|
||||
SQTT_GFXIP_LEVEL_GFXIP_7 = 0x2,
|
||||
SQTT_GFXIP_LEVEL_GFXIP_8 = 0x3,
|
||||
SQTT_GFXIP_LEVEL_GFXIP_8_1 = 0x4,
|
||||
SQTT_GFXIP_LEVEL_GFXIP_9 = 0x5,
|
||||
SQTT_GFXIP_LEVEL_GFXIP_10_1 = 0x7,
|
||||
SQTT_GFXIP_LEVEL_GFXIP_10_3 = 0x9,
|
||||
SQTT_GFXIP_LEVEL_GFXIP_11_0 = 0xc,
|
||||
};
|
||||
|
||||
enum sqtt_memory_type
|
||||
{
|
||||
SQTT_MEMORY_TYPE_UNKNOWN = 0x0,
|
||||
SQTT_MEMORY_TYPE_DDR = 0x1,
|
||||
SQTT_MEMORY_TYPE_DDR2 = 0x2,
|
||||
SQTT_MEMORY_TYPE_DDR3 = 0x3,
|
||||
SQTT_MEMORY_TYPE_DDR4 = 0x4,
|
||||
SQTT_MEMORY_TYPE_DDR5 = 0x5,
|
||||
SQTT_MEMORY_TYPE_GDDR3 = 0x10,
|
||||
SQTT_MEMORY_TYPE_GDDR4 = 0x11,
|
||||
SQTT_MEMORY_TYPE_GDDR5 = 0x12,
|
||||
SQTT_MEMORY_TYPE_GDDR6 = 0x13,
|
||||
SQTT_MEMORY_TYPE_HBM = 0x20,
|
||||
SQTT_MEMORY_TYPE_HBM2 = 0x21,
|
||||
SQTT_MEMORY_TYPE_HBM3 = 0x22,
|
||||
SQTT_MEMORY_TYPE_LPDDR4 = 0x30,
|
||||
SQTT_MEMORY_TYPE_LPDDR5 = 0x31,
|
||||
};
|
||||
|
||||
struct sqtt_file_chunk_asic_info {
|
||||
struct sqtt_file_chunk_header header;
|
||||
uint64_t flags;
|
||||
uint64_t trace_shader_core_clock;
|
||||
uint64_t trace_memory_clock;
|
||||
int32_t device_id;
|
||||
int32_t device_revision_id;
|
||||
int32_t vgprs_per_simd;
|
||||
int32_t sgprs_per_simd;
|
||||
int32_t shader_engines;
|
||||
int32_t compute_unit_per_shader_engine;
|
||||
int32_t simd_per_compute_unit;
|
||||
int32_t wavefronts_per_simd;
|
||||
int32_t minimum_vgpr_alloc;
|
||||
int32_t vgpr_alloc_granularity;
|
||||
int32_t minimum_sgpr_alloc;
|
||||
int32_t sgpr_alloc_granularity;
|
||||
int32_t hardware_contexts;
|
||||
enum sqtt_gpu_type gpu_type;
|
||||
enum sqtt_gfxip_level gfxip_level;
|
||||
int32_t gpu_index;
|
||||
int32_t gds_size;
|
||||
int32_t gds_per_shader_engine;
|
||||
int32_t ce_ram_size;
|
||||
int32_t ce_ram_size_graphics;
|
||||
int32_t ce_ram_size_compute;
|
||||
int32_t max_number_of_dedicated_cus;
|
||||
int64_t vram_size;
|
||||
int32_t vram_bus_width;
|
||||
int32_t l2_cache_size;
|
||||
int32_t l1_cache_size;
|
||||
int32_t lds_size;
|
||||
char gpu_name[SQTT_GPU_NAME_MAX_SIZE];
|
||||
float alu_per_clock;
|
||||
float texture_per_clock;
|
||||
float prims_per_clock;
|
||||
float pixels_per_clock;
|
||||
uint64_t gpu_timestamp_frequency;
|
||||
uint64_t max_shader_core_clock;
|
||||
uint64_t max_memory_clock;
|
||||
uint32_t memory_ops_per_clock;
|
||||
enum sqtt_memory_type memory_chip_type;
|
||||
uint32_t lds_granularity;
|
||||
uint16_t cu_mask[SQTT_MAX_NUM_SE][SQTT_SA_PER_SE];
|
||||
char reserved1[128];
|
||||
uint32_t active_pixel_packer_mask[SQTT_ACTIVE_PIXEL_PACKER_MASK_DWORDS];
|
||||
char reserved2[16];
|
||||
uint32_t gl1_cache_size;
|
||||
uint32_t instruction_cache_size;
|
||||
uint32_t scalar_cache_size;
|
||||
uint32_t mall_cache_size;
|
||||
char padding[4];
|
||||
};
|
||||
|
||||
enum sqtt_api_type
|
||||
{
|
||||
SQTT_API_TYPE_DIRECTX_12,
|
||||
SQTT_API_TYPE_VULKAN,
|
||||
SQTT_API_TYPE_GENERIC,
|
||||
SQTT_API_TYPE_OPENCL
|
||||
};
|
||||
|
||||
enum sqtt_instruction_trace_mode
|
||||
{
|
||||
SQTT_INSTRUCTION_TRACE_DISABLED = 0x0,
|
||||
SQTT_INSTRUCTION_TRACE_FULL_FRAME = 0x1,
|
||||
SQTT_INSTRUCTION_TRACE_API_PSO = 0x2,
|
||||
};
|
||||
|
||||
enum sqtt_profiling_mode
|
||||
{
|
||||
SQTT_PROFILING_MODE_PRESENT = 0x0,
|
||||
SQTT_PROFILING_MODE_USER_MARKERS = 0x1,
|
||||
SQTT_PROFILING_MODE_INDEX = 0x2,
|
||||
SQTT_PROFILING_MODE_TAG = 0x3,
|
||||
};
|
||||
|
||||
union sqtt_profiling_mode_data {
|
||||
struct {
|
||||
char start[256];
|
||||
char end[256];
|
||||
} user_marker_profiling_data;
|
||||
|
||||
struct {
|
||||
uint32_t start;
|
||||
uint32_t end;
|
||||
} index_profiling_data;
|
||||
|
||||
struct {
|
||||
uint32_t begin_hi;
|
||||
uint32_t begin_lo;
|
||||
uint32_t end_hi;
|
||||
uint32_t end_lo;
|
||||
} tag_profiling_data;
|
||||
};
|
||||
|
||||
union sqtt_instruction_trace_data {
|
||||
struct {
|
||||
uint64_t api_pso_filter;
|
||||
} api_pso_data;
|
||||
|
||||
struct {
|
||||
uint32_t mask;
|
||||
} shader_engine_filter;
|
||||
};
|
||||
|
||||
struct sqtt_file_chunk_api_info {
|
||||
struct sqtt_file_chunk_header header;
|
||||
enum sqtt_api_type api_type;
|
||||
uint16_t major_version;
|
||||
uint16_t minor_version;
|
||||
enum sqtt_profiling_mode profiling_mode;
|
||||
uint32_t reserved;
|
||||
union sqtt_profiling_mode_data profiling_mode_data;
|
||||
enum sqtt_instruction_trace_mode instruction_trace_mode;
|
||||
uint32_t reserved2;
|
||||
union sqtt_instruction_trace_data instruction_trace_data;
|
||||
};
|
||||
|
||||
|
||||
struct sqtt_code_object_database_record {
|
||||
uint32_t size;
|
||||
};
|
||||
|
||||
/*
 * Code object database chunk. As read and written by extra/sqtt/rgptool.py, this header is followed by
 * record_count entries, each a struct sqtt_code_object_database_record and then record.size bytes of blob.
 */
struct sqtt_file_chunk_code_object_database {
|
||||
struct sqtt_file_chunk_header header;
|
||||
uint32_t offset; /* rgptool.py stores the file offset of the start of this chunk here */
|
||||
uint32_t flags;
|
||||
uint32_t size; /* rgptool.py: this header plus all records, in bytes */
|
||||
uint32_t record_count;
|
||||
};
|
||||
|
||||
|
||||
struct sqtt_code_object_loader_events_record {
|
||||
uint32_t loader_event_type;
|
||||
uint32_t reserved;
|
||||
uint64_t base_address;
|
||||
uint64_t code_object_hash[2];
|
||||
uint64_t time_stamp;
|
||||
};
|
||||
|
||||
struct sqtt_file_chunk_code_object_loader_events {
|
||||
struct sqtt_file_chunk_header header;
|
||||
uint32_t offset;
|
||||
uint32_t flags;
|
||||
uint32_t record_size;
|
||||
uint32_t record_count;
|
||||
};
|
||||
|
||||
struct sqtt_pso_correlation_record {
|
||||
uint64_t api_pso_hash;
|
||||
uint64_t pipeline_hash[2];
|
||||
char api_level_obj_name[64];
|
||||
};
|
||||
|
||||
struct sqtt_file_chunk_pso_correlation {
|
||||
struct sqtt_file_chunk_header header;
|
||||
uint32_t offset;
|
||||
uint32_t flags;
|
||||
uint32_t record_size;
|
||||
uint32_t record_count;
|
||||
};
|
||||
|
||||
struct sqtt_file_chunk_sqtt_desc {
|
||||
struct sqtt_file_chunk_header header;
|
||||
int32_t shader_engine_index;
|
||||
enum sqtt_version sqtt_version;
|
||||
union {
|
||||
struct {
|
||||
int32_t instrumentation_version;
|
||||
} v0;
|
||||
struct {
|
||||
int16_t instrumentation_spec_version;
|
||||
int16_t instrumentation_api_version;
|
||||
int32_t compute_unit_index;
|
||||
} v1;
|
||||
};
|
||||
};
|
||||
|
||||
/* Chunk carrying raw SQTT trace data; extra/sqtt/rgptool.py writes the payload directly after this header. */
struct sqtt_file_chunk_sqtt_data {
|
||||
struct sqtt_file_chunk_header header;
|
||||
int32_t offset; /* in bytes; rgptool.py stores the file offset of the payload (chunk start + sizeof this struct) */
|
||||
int32_t size; /* in bytes; rgptool.py stores the payload length */
|
||||
};
|
||||
|
||||
struct sqtt_file_chunk_queue_event_timings {
|
||||
struct sqtt_file_chunk_header header;
|
||||
uint32_t queue_info_table_record_count;
|
||||
uint32_t queue_info_table_size;
|
||||
uint32_t queue_event_table_record_count;
|
||||
uint32_t queue_event_table_size;
|
||||
};
|
||||
|
||||
|
||||
enum sqtt_queue_type {
|
||||
SQTT_QUEUE_TYPE_UNKNOWN = 0x0,
|
||||
SQTT_QUEUE_TYPE_UNIVERSAL = 0x1,
|
||||
SQTT_QUEUE_TYPE_COMPUTE = 0x2,
|
||||
SQTT_QUEUE_TYPE_DMA = 0x3,
|
||||
};
|
||||
|
||||
enum sqtt_engine_type {
|
||||
SQTT_ENGINE_TYPE_UNKNOWN = 0x0,
|
||||
SQTT_ENGINE_TYPE_UNIVERSAL = 0x1,
|
||||
SQTT_ENGINE_TYPE_COMPUTE = 0x2,
|
||||
SQTT_ENGINE_TYPE_EXCLUSIVE_COMPUTE = 0x3,
|
||||
SQTT_ENGINE_TYPE_DMA = 0x4,
|
||||
SQTT_ENGINE_TYPE_HIGH_PRIORITY_UNIVERSAL = 0x7,
|
||||
SQTT_ENGINE_TYPE_HIGH_PRIORITY_GRAPHICS = 0x8,
|
||||
};
|
||||
|
||||
/**
 * One 32-bit word describing a queue: queue_type and engine_type take 8 bits
 * each, followed by 16 reserved bits; `value` aliases the whole word.
 * NOTE(review): the two fields presumably carry enum sqtt_queue_type and
 * enum sqtt_engine_type values - confirm.
 */
struct sqtt_queue_hardware_info {
|
||||
union {
|
||||
struct {
|
||||
int32_t queue_type : 8;
|
||||
int32_t engine_type : 8;
|
||||
uint32_t reserved : 16;
|
||||
};
|
||||
uint32_t value;
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
struct sqtt_queue_info_record {
|
||||
uint64_t queue_id;
|
||||
uint64_t queue_context;
|
||||
struct sqtt_queue_hardware_info hardware_info;
|
||||
uint32_t reserved;
|
||||
};
|
||||
|
||||
enum sqtt_queue_event_type {
|
||||
SQTT_QUEUE_TIMING_EVENT_CMDBUF_SUBMIT,
|
||||
SQTT_QUEUE_TIMING_EVENT_SIGNAL_SEMAPHORE,
|
||||
SQTT_QUEUE_TIMING_EVENT_WAIT_SEMAPHORE,
|
||||
SQTT_QUEUE_TIMING_EVENT_PRESENT
|
||||
};
|
||||
|
||||
struct sqtt_queue_event_record {
|
||||
enum sqtt_queue_event_type event_type;
|
||||
uint32_t sqtt_cb_id;
|
||||
uint64_t frame_index;
|
||||
uint32_t queue_info_index;
|
||||
uint32_t submit_sub_index;
|
||||
uint64_t api_id;
|
||||
uint64_t cpu_timestamp;
|
||||
uint64_t gpu_timestamps[2];
|
||||
};
|
||||
|
||||
struct sqtt_file_chunk_clock_calibration {
|
||||
struct sqtt_file_chunk_header header;
|
||||
uint64_t cpu_timestamp;
|
||||
uint64_t gpu_timestamp;
|
||||
uint64_t reserved;
|
||||
};
|
||||
|
||||
enum elf_gfxip_level
|
||||
{
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX801 = 0x028,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1010 = 0x033,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041,
|
||||
};
|
||||
|
||||
struct sqtt_file_chunk_spm_db {
|
||||
struct sqtt_file_chunk_header header;
|
||||
uint32_t flags;
|
||||
uint32_t preamble_size;
|
||||
uint32_t num_timestamps;
|
||||
uint32_t num_spm_counter_info;
|
||||
uint32_t spm_counter_info_size;
|
||||
uint32_t sample_interval;
|
||||
};
|
||||
|
||||
/**
|
||||
* Identifiers for RGP SQ thread-tracing markers (Table 1)
|
||||
*/
|
||||
enum rgp_sqtt_marker_identifier
|
||||
{
|
||||
RGP_SQTT_MARKER_IDENTIFIER_EVENT = 0x0,
|
||||
RGP_SQTT_MARKER_IDENTIFIER_CB_START = 0x1,
|
||||
RGP_SQTT_MARKER_IDENTIFIER_CB_END = 0x2,
|
||||
RGP_SQTT_MARKER_IDENTIFIER_BARRIER_START = 0x3,
|
||||
RGP_SQTT_MARKER_IDENTIFIER_BARRIER_END = 0x4,
|
||||
RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT = 0x5,
|
||||
RGP_SQTT_MARKER_IDENTIFIER_GENERAL_API = 0x6,
|
||||
RGP_SQTT_MARKER_IDENTIFIER_SYNC = 0x7,
|
||||
RGP_SQTT_MARKER_IDENTIFIER_PRESENT = 0x8,
|
||||
RGP_SQTT_MARKER_IDENTIFIER_LAYOUT_TRANSITION = 0x9,
|
||||
RGP_SQTT_MARKER_IDENTIFIER_RENDER_PASS = 0xA,
|
||||
RGP_SQTT_MARKER_IDENTIFIER_RESERVED2 = 0xB,
|
||||
RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE = 0xC,
|
||||
RGP_SQTT_MARKER_IDENTIFIER_RESERVED4 = 0xD,
|
||||
RGP_SQTT_MARKER_IDENTIFIER_RESERVED5 = 0xE,
|
||||
RGP_SQTT_MARKER_IDENTIFIER_RESERVED6 = 0xF
|
||||
};
|
||||
|
||||
/**
|
||||
* Command buffer IDs used in RGP SQ thread-tracing markers (only 20 bits).
|
||||
*/
|
||||
union rgp_sqtt_marker_cb_id {
|
||||
struct {
|
||||
uint32_t per_frame : 1; /* Must be 1, frame-based command buffer ID. */
|
||||
uint32_t frame_index : 7;
|
||||
uint32_t cb_index : 12; /* Command buffer index within the frame. */
|
||||
uint32_t reserved : 12;
|
||||
} per_frame_cb_id;
|
||||
|
||||
struct {
|
||||
uint32_t per_frame : 1; /* Must be 0, global command buffer ID. */
|
||||
uint32_t cb_index : 19; /* Global command buffer index. */
|
||||
uint32_t reserved : 12;
|
||||
} global_cb_id;
|
||||
|
||||
uint32_t all;
|
||||
};
|
||||
|
||||
/**
|
||||
* RGP SQ thread-tracing marker for the start of a command buffer. (Table 2)
|
||||
*/
|
||||
struct rgp_sqtt_marker_cb_start {
|
||||
union {
|
||||
struct {
|
||||
uint32_t identifier : 4;
|
||||
uint32_t ext_dwords : 3;
|
||||
uint32_t cb_id : 20;
|
||||
uint32_t queue : 5;
|
||||
};
|
||||
uint32_t dword01;
|
||||
};
|
||||
union {
|
||||
uint32_t device_id_low;
|
||||
uint32_t dword02;
|
||||
};
|
||||
union {
|
||||
uint32_t device_id_high;
|
||||
uint32_t dword03;
|
||||
};
|
||||
union {
|
||||
uint32_t queue_flags;
|
||||
uint32_t dword04;
|
||||
};
|
||||
};
|
||||
|
||||
/**
|
||||
*
|
||||
* RGP SQ thread-tracing marker for the end of a command buffer. (Table 3)
|
||||
*/
|
||||
struct rgp_sqtt_marker_cb_end {
|
||||
union {
|
||||
struct {
|
||||
uint32_t identifier : 4;
|
||||
uint32_t ext_dwords : 3;
|
||||
uint32_t cb_id : 20;
|
||||
uint32_t reserved : 5;
|
||||
};
|
||||
uint32_t dword01;
|
||||
};
|
||||
union {
|
||||
uint32_t device_id_low;
|
||||
uint32_t dword02;
|
||||
};
|
||||
union {
|
||||
uint32_t device_id_high;
|
||||
uint32_t dword03;
|
||||
};
|
||||
};
|
||||
|
||||
/**
|
||||
* API types used in RGP SQ thread-tracing markers for the "General API"
|
||||
* packet.
|
||||
*/
|
||||
enum rgp_sqtt_marker_general_api_type
|
||||
{
|
||||
ApiCmdBindPipeline = 0,
|
||||
ApiCmdBindDescriptorSets = 1,
|
||||
ApiCmdBindIndexBuffer = 2,
|
||||
ApiCmdBindVertexBuffers = 3,
|
||||
ApiCmdDraw = 4,
|
||||
ApiCmdDrawIndexed = 5,
|
||||
ApiCmdDrawIndirect = 6,
|
||||
ApiCmdDrawIndexedIndirect = 7,
|
||||
ApiCmdDrawIndirectCountAMD = 8,
|
||||
ApiCmdDrawIndexedIndirectCountAMD = 9,
|
||||
ApiCmdDispatch = 10,
|
||||
ApiCmdDispatchIndirect = 11,
|
||||
ApiCmdCopyBuffer = 12,
|
||||
ApiCmdCopyImage = 13,
|
||||
ApiCmdBlitImage = 14,
|
||||
ApiCmdCopyBufferToImage = 15,
|
||||
ApiCmdCopyImageToBuffer = 16,
|
||||
ApiCmdUpdateBuffer = 17,
|
||||
ApiCmdFillBuffer = 18,
|
||||
ApiCmdClearColorImage = 19,
|
||||
ApiCmdClearDepthStencilImage = 20,
|
||||
ApiCmdClearAttachments = 21,
|
||||
ApiCmdResolveImage = 22,
|
||||
ApiCmdWaitEvents = 23,
|
||||
ApiCmdPipelineBarrier = 24,
|
||||
ApiCmdBeginQuery = 25,
|
||||
ApiCmdEndQuery = 26,
|
||||
ApiCmdResetQueryPool = 27,
|
||||
ApiCmdWriteTimestamp = 28,
|
||||
ApiCmdCopyQueryPoolResults = 29,
|
||||
ApiCmdPushConstants = 30,
|
||||
ApiCmdBeginRenderPass = 31,
|
||||
ApiCmdNextSubpass = 32,
|
||||
ApiCmdEndRenderPass = 33,
|
||||
ApiCmdExecuteCommands = 34,
|
||||
ApiCmdSetViewport = 35,
|
||||
ApiCmdSetScissor = 36,
|
||||
ApiCmdSetLineWidth = 37,
|
||||
ApiCmdSetDepthBias = 38,
|
||||
ApiCmdSetBlendConstants = 39,
|
||||
ApiCmdSetDepthBounds = 40,
|
||||
ApiCmdSetStencilCompareMask = 41,
|
||||
ApiCmdSetStencilWriteMask = 42,
|
||||
ApiCmdSetStencilReference = 43,
|
||||
ApiCmdDrawIndirectCount = 44,
|
||||
ApiCmdDrawIndexedIndirectCount = 45,
|
||||
/* gap */
|
||||
ApiCmdDrawMeshTasksEXT = 47,
|
||||
ApiCmdDrawMeshTasksIndirectCountEXT = 48,
|
||||
ApiCmdDrawMeshTasksIndirectEXT = 49,
|
||||
|
||||
ApiRayTracingSeparateCompiled = 0x800000,
|
||||
ApiInvalid = 0xffffffff
|
||||
};
|
||||
|
||||
/**
|
||||
* RGP SQ thread-tracing marker for a "General API" instrumentation packet.
|
||||
*/
|
||||
struct rgp_sqtt_marker_general_api {
|
||||
union {
|
||||
struct {
|
||||
uint32_t identifier : 4;
|
||||
uint32_t ext_dwords : 3;
|
||||
uint32_t api_type : 20;
|
||||
uint32_t is_end : 1;
|
||||
uint32_t reserved : 4;
|
||||
};
|
||||
uint32_t dword01;
|
||||
};
|
||||
};
|
||||
|
||||
/**
|
||||
* API types used in RGP SQ thread-tracing markers (Table 16).
|
||||
*/
|
||||
enum rgp_sqtt_marker_event_type
|
||||
{
|
||||
EventCmdDraw = 0,
|
||||
EventCmdDrawIndexed = 1,
|
||||
EventCmdDrawIndirect = 2,
|
||||
EventCmdDrawIndexedIndirect = 3,
|
||||
EventCmdDrawIndirectCountAMD = 4,
|
||||
EventCmdDrawIndexedIndirectCountAMD = 5,
|
||||
EventCmdDispatch = 6,
|
||||
EventCmdDispatchIndirect = 7,
|
||||
EventCmdCopyBuffer = 8,
|
||||
EventCmdCopyImage = 9,
|
||||
EventCmdBlitImage = 10,
|
||||
EventCmdCopyBufferToImage = 11,
|
||||
EventCmdCopyImageToBuffer = 12,
|
||||
EventCmdUpdateBuffer = 13,
|
||||
EventCmdFillBuffer = 14,
|
||||
EventCmdClearColorImage = 15,
|
||||
EventCmdClearDepthStencilImage = 16,
|
||||
EventCmdClearAttachments = 17,
|
||||
EventCmdResolveImage = 18,
|
||||
EventCmdWaitEvents = 19,
|
||||
EventCmdPipelineBarrier = 20,
|
||||
EventCmdResetQueryPool = 21,
|
||||
EventCmdCopyQueryPoolResults = 22,
|
||||
EventRenderPassColorClear = 23,
|
||||
EventRenderPassDepthStencilClear = 24,
|
||||
EventRenderPassResolve = 25,
|
||||
EventInternalUnknown = 26,
|
||||
EventCmdDrawIndirectCount = 27,
|
||||
EventCmdDrawIndexedIndirectCount = 28,
|
||||
/* gap */
|
||||
EventCmdTraceRaysKHR = 30,
|
||||
EventCmdTraceRaysIndirectKHR = 31,
|
||||
EventCmdBuildAccelerationStructuresKHR = 32,
|
||||
EventCmdBuildAccelerationStructuresIndirectKHR = 33,
|
||||
EventCmdCopyAccelerationStructureKHR = 34,
|
||||
EventCmdCopyAccelerationStructureToMemoryKHR = 35,
|
||||
EventCmdCopyMemoryToAccelerationStructureKHR = 36,
|
||||
/* gap */
|
||||
EventCmdDrawMeshTasksEXT = 41,
|
||||
EventCmdDrawMeshTasksIndirectCountEXT = 42,
|
||||
EventCmdDrawMeshTasksIndirectEXT = 43,
|
||||
EventUnknown = 0x7fff,
|
||||
EventInvalid = 0xffffffff
|
||||
};
|
||||
|
||||
/**
|
||||
* "Event (Per-draw/dispatch)" RGP SQ thread-tracing marker. (Table 4)
|
||||
*/
|
||||
struct rgp_sqtt_marker_event {
|
||||
union {
|
||||
struct {
|
||||
uint32_t identifier : 4;
|
||||
uint32_t ext_dwords : 3;
|
||||
uint32_t api_type : 24;
|
||||
uint32_t has_thread_dims : 1;
|
||||
};
|
||||
uint32_t dword01;
|
||||
};
|
||||
union {
|
||||
struct {
|
||||
uint32_t cb_id : 20;
|
||||
uint32_t vertex_offset_reg_idx : 4;
|
||||
uint32_t instance_offset_reg_idx : 4;
|
||||
uint32_t draw_index_reg_idx : 4;
|
||||
};
|
||||
uint32_t dword02;
|
||||
};
|
||||
union {
|
||||
uint32_t cmd_id;
|
||||
uint32_t dword03;
|
||||
};
|
||||
};
|
||||
|
||||
/**
|
||||
* Per-dispatch specific marker where workgroup dims are included.
|
||||
*/
|
||||
struct rgp_sqtt_marker_event_with_dims {
|
||||
struct rgp_sqtt_marker_event event;
|
||||
uint32_t thread_x;
|
||||
uint32_t thread_y;
|
||||
uint32_t thread_z;
|
||||
};
|
||||
|
||||
/**
|
||||
* "Barrier Start" RGP SQTT instrumentation marker (Table 5)
|
||||
*/
|
||||
struct rgp_sqtt_marker_barrier_start {
|
||||
union {
|
||||
struct {
|
||||
uint32_t identifier : 4;
|
||||
uint32_t ext_dwords : 3;
|
||||
uint32_t cb_id : 20;
|
||||
uint32_t reserved : 5;
|
||||
};
|
||||
uint32_t dword01;
|
||||
};
|
||||
union {
|
||||
struct {
|
||||
uint32_t driver_reason : 31;
|
||||
uint32_t internal : 1;
|
||||
};
|
||||
uint32_t dword02;
|
||||
};
|
||||
};
|
||||
|
||||
/**
|
||||
* "Barrier End" RGP SQTT instrumentation marker (Table 6)
|
||||
*/
|
||||
struct rgp_sqtt_marker_barrier_end {
|
||||
union {
|
||||
struct {
|
||||
uint32_t identifier : 4;
|
||||
uint32_t ext_dwords : 3;
|
||||
uint32_t cb_id : 20;
|
||||
uint32_t wait_on_eop_ts : 1;
|
||||
uint32_t vs_partial_flush : 1;
|
||||
uint32_t ps_partial_flush : 1;
|
||||
uint32_t cs_partial_flush : 1;
|
||||
uint32_t pfp_sync_me : 1;
|
||||
};
|
||||
uint32_t dword01;
|
||||
};
|
||||
union {
|
||||
struct {
|
||||
uint32_t sync_cp_dma : 1;
|
||||
uint32_t inval_tcp : 1;
|
||||
uint32_t inval_sqI : 1;
|
||||
uint32_t inval_sqK : 1;
|
||||
uint32_t flush_tcc : 1;
|
||||
uint32_t inval_tcc : 1;
|
||||
uint32_t flush_cb : 1;
|
||||
uint32_t inval_cb : 1;
|
||||
uint32_t flush_db : 1;
|
||||
uint32_t inval_db : 1;
|
||||
uint32_t num_layout_transitions : 16;
|
||||
uint32_t inval_gl1 : 1;
|
||||
uint32_t wait_on_ts : 1;
|
||||
uint32_t eop_ts_bottom_of_pipe : 1;
|
||||
uint32_t eos_ts_ps_done : 1;
|
||||
uint32_t eos_ts_cs_done : 1;
|
||||
uint32_t reserved : 1;
|
||||
};
|
||||
uint32_t dword02;
|
||||
};
|
||||
};
|
||||
|
||||
/**
|
||||
* "Layout Transition" RGP SQTT instrumentation marker (Table 7)
|
||||
*/
|
||||
struct rgp_sqtt_marker_layout_transition {
|
||||
union {
|
||||
struct {
|
||||
uint32_t identifier : 4;
|
||||
uint32_t ext_dwords : 3;
|
||||
uint32_t depth_stencil_expand : 1;
|
||||
uint32_t htile_hiz_range_expand : 1;
|
||||
uint32_t depth_stencil_resummarize : 1;
|
||||
uint32_t dcc_decompress : 1;
|
||||
uint32_t fmask_decompress : 1;
|
||||
uint32_t fast_clear_eliminate : 1;
|
||||
uint32_t fmask_color_expand : 1;
|
||||
uint32_t init_mask_ram : 1;
|
||||
uint32_t reserved1 : 17;
|
||||
};
|
||||
uint32_t dword01;
|
||||
};
|
||||
union {
|
||||
struct {
|
||||
uint32_t reserved2 : 32;
|
||||
};
|
||||
uint32_t dword02;
|
||||
};
|
||||
};
|
||||
|
||||
/**
|
||||
* "User Event" RGP SQTT instrumentation marker (Table 8)
|
||||
*/
|
||||
struct rgp_sqtt_marker_user_event {
|
||||
union {
|
||||
struct {
|
||||
uint32_t identifier : 4;
|
||||
uint32_t reserved0 : 8;
|
||||
uint32_t data_type : 8;
|
||||
uint32_t reserved1 : 12;
|
||||
};
|
||||
uint32_t dword01;
|
||||
};
|
||||
};
|
||||
/**
 * A "User Event" marker dword followed by a 32-bit length field.
 * NOTE(review): presumably the length of the string data that accompanies the
 * marker - confirm.
 */
struct rgp_sqtt_marker_user_event_with_length {
|
||||
struct rgp_sqtt_marker_user_event user_event;
|
||||
uint32_t length;
|
||||
};
|
||||
|
||||
/**
 * User event kinds, numbered consecutively from 0
 * (Trigger = 0, Pop = 1, Push = 2, ObjectName = 3).
 * NOTE(review): presumably the value stored in
 * rgp_sqtt_marker_user_event.data_type - confirm.
 */
enum rgp_sqtt_marker_user_event_type
|
||||
{
|
||||
UserEventTrigger = 0,
|
||||
UserEventPop,
|
||||
UserEventPush,
|
||||
UserEventObjectName,
|
||||
};
|
||||
|
||||
/**
|
||||
* "Pipeline bind" RGP SQTT instrumentation marker (Table 12)
|
||||
*/
|
||||
struct rgp_sqtt_marker_pipeline_bind {
|
||||
union {
|
||||
struct {
|
||||
uint32_t identifier : 4;
|
||||
uint32_t ext_dwords : 3;
|
||||
uint32_t bind_point : 1;
|
||||
uint32_t cb_id : 20;
|
||||
uint32_t reserved : 4;
|
||||
};
|
||||
uint32_t dword01;
|
||||
};
|
||||
union {
|
||||
uint32_t api_pso_hash[2];
|
||||
struct {
|
||||
uint32_t dword02;
|
||||
uint32_t dword03;
|
||||
};
|
||||
};
|
||||
};
|
||||
@@ -57,6 +57,9 @@ class ProfileDeviceEvent(ProfileEvent):
|
||||
@dataclass(frozen=True)
|
||||
class ProfileRangeEvent(ProfileEvent): device:str; name:str; st:decimal.Decimal; en:decimal.Decimal; is_copy:bool # noqa: E702
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ProfileProgramEvent(ProfileEvent): device:str; name:str; lib:bytes|None; base:int|None # noqa: E702
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ProfileGraphEntry: device:str; name:str; st_id:int; en_id:int; is_copy:bool # noqa: E702
|
||||
|
||||
@@ -342,8 +345,9 @@ if PROFILE:
|
||||
|
||||
with open(fn:=temp("profile.pkl", append_user=True), "wb") as f: pickle.dump(Compiled.profile_events, f)
|
||||
|
||||
from tinygrad.ops import launch_viz
|
||||
launch_viz("PROFILE", fn)
|
||||
if not getenv("SQTT", 0):
|
||||
from tinygrad.ops import launch_viz
|
||||
launch_viz("PROFILE", fn)
|
||||
|
||||
if __name__ == "__main__":
|
||||
for device in ALL_DEVICES:
|
||||
|
||||
1789
tinygrad/runtime/autogen/sqtt.py
Normal file
1789
tinygrad/runtime/autogen/sqtt.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,15 +1,15 @@
|
||||
from __future__ import annotations
|
||||
from typing import Any, cast, ClassVar
|
||||
import os, ctypes, ctypes.util, functools, mmap, errno, array, contextlib, sys, select
|
||||
import os, ctypes, ctypes.util, struct, hashlib, functools, mmap, errno, array, contextlib, sys, select
|
||||
assert sys.platform != 'win32'
|
||||
from dataclasses import dataclass
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, HWInterface
|
||||
from tinygrad.ops import sint
|
||||
from tinygrad.device import BufferSpec, CPUProgram
|
||||
from tinygrad.device import Compiled, ProfileEvent, BufferSpec, CPUProgram, PROFILE
|
||||
from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address, DEBUG, OSX
|
||||
from tinygrad.renderer.cstyle import AMDRenderer
|
||||
from tinygrad.runtime.autogen import kfd, hsa, amd_gpu, libc, pci, vfio
|
||||
from tinygrad.runtime.autogen.am import am
|
||||
from tinygrad.runtime.autogen import kfd, hsa, amd_gpu, libc, pci, vfio, sqtt
|
||||
from tinygrad.runtime.autogen.am import am, gc_11_0_0
|
||||
from tinygrad.runtime.support.compiler_hip import AMDCompiler
|
||||
from tinygrad.runtime.support.elf import elf_loader
|
||||
from tinygrad.runtime.support.am.amdev import AMDev, AMMapping
|
||||
@@ -18,13 +18,21 @@ if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401 # pylint
|
||||
regBIF_BX_PF1_GPU_HDP_FLUSH_REQ, regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0106, 0x0107
|
||||
|
||||
EVENT_INDEX_PARTIAL_FLUSH = 4 # based on a comment in nvd.h
|
||||
WAIT_REG_MEM_FUNCTION_EQ = 3 # ==
|
||||
WAIT_REG_MEM_FUNCTION_NEQ = 4 # !=
|
||||
WAIT_REG_MEM_FUNCTION_GEQ = 5 # >=
|
||||
|
||||
COMPUTE_SHADER_EN, FORCE_START_AT_000, CS_W32_EN = (1 << 0), (1 << 2), (1 << 15)
|
||||
|
||||
def gfxreg(reg):
  """Rebase a GC segment-0 register offset so it is relative to PACKET3_SET_SH_REG_START."""
  absolute = reg + amd_gpu.GC_BASE__INST0_SEG0
  return absolute - amd_gpu.PACKET3_SET_SH_REG_START
|
||||
def ucfgreg(reg, pkt3_set:bool=True):
  """Rebase a GC segment-1 register offset.

  With pkt3_set (the default) the result is relative to PACKET3_SET_UCONFIG_REG_START; with pkt3_set=False only the
  segment base is added and no packet start offset is subtracted.
  """
  absolute = reg + amd_gpu.GC_BASE__INST0_SEG1
  if pkt3_set: return absolute - amd_gpu.PACKET3_SET_UCONFIG_REG_START
  return absolute - 0
|
||||
def nbioreg(reg):
  """Add the NBIO instance-0 segment-2 base to a register offset."""
  nbio_base = amd_gpu.NBIO_BASE__INST0_SEG2
  return reg + nbio_base
|
||||
|
||||
# This can potentially be shared with AMRegister._parse_kwargs. NOTE: This is hardcoded to gfx11, bitfields might be different in other gfxvers.
|
||||
# Currently not a problem because this is only used by sqtt and sqtt is only supported on 7900xtx
|
||||
def encode_bitfields(regname: str, **kwargs) -> int:
  """Pack named bitfield values into a single register word.

  Every keyword `field=value` is shifted left by the constant `gc_11_0_0.<regname>__<FIELD>__SHIFT` (field name
  upper-cased) and the shifted values are OR-ed together. Values are not masked to their field width.
  Returns 0 when no fields are given; an unknown register/field name raises AttributeError from the lookup.
  """
  # resolve every field first so a bad field name fails before anything is combined
  shifted = [value << getattr(gc_11_0_0, f'{regname}__{field.upper()}__SHIFT') for field, value in kwargs.items()]
  word = 0
  for part in shifted: word |= part
  return word
|
||||
|
||||
class AMDSignal(HCQSignal):
|
||||
def __init__(self, base_addr:int|None=None, **kwargs):
|
||||
super().__init__(base_addr, **kwargs, timestamp_divider=100, dev_t=AMDDevice)
|
||||
@@ -40,6 +48,11 @@ class AMDComputeQueue(HWQueue):
|
||||
|
||||
def pkt3(self, cmd, *vals): self.q(amd_gpu.PACKET3(cmd, len(vals) - 1), *vals)
|
||||
|
||||
def sqtt_userdata(self, data, *extra_dwords):
  """Emit `data` (plus any extra dwords) as SQ_THREAD_TRACE_USERDATA register writes.

  `bytes(data)` is split into little-endian 32-bit words (struct.iter_unpack raises struct.error if its length is
  not a multiple of 4), `extra_dwords` are appended, and the words are sent at most two per SET_UCONFIG_REG packet,
  every packet addressed at regSQ_THREAD_TRACE_USERDATA_2.
  """
  dwords = [dw for (dw,) in struct.iter_unpack('<I', bytes(data))]
  dwords.extend(extra_dwords)
  for start in range(0, len(dwords), 2):
    pair = dwords[start:start+2]  # the final chunk holds one dword when the total count is odd
    self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_USERDATA_2), *pair)
|
||||
|
||||
def wait_reg_mem(self, value, mask=0xffffffff, mem=None, reg_req=None, reg_done=None):
|
||||
wrm_info_dw = amd_gpu.WAIT_REG_MEM_MEM_SPACE(int(mem is not None)) | amd_gpu.WAIT_REG_MEM_OPERATION(int(mem is None)) \
|
||||
| amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | amd_gpu.WAIT_REG_MEM_ENGINE(0)
|
||||
@@ -72,6 +85,83 @@ class AMDComputeQueue(HWQueue):
|
||||
self.acquire_mem()
|
||||
return self
|
||||
|
||||
def spi_config(self, tracing:bool):
  """Program SPI_CONFIG_CNTL; the two enable_sqg_*_events fields are 1 only when `tracing` is true."""
  sqg_events = int(tracing)
  cntl_fields = {'ps_pkr_priority_cntl': 3, 'exp_priority_order': 3, 'gpr_write_priority': 0x2c688,
                 'enable_sqg_bop_events': sqg_events, 'enable_sqg_top_events': sqg_events}
  cntl = encode_bitfields('SPI_CONFIG_CNTL', **cntl_fields)
  self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSPI_CONFIG_CNTL), cntl)
|
||||
|
||||
def sqtt_config(self, tracing:bool):
  """Program SQ_THREAD_TRACE_CTRL; the `mode` field is 1 when `tracing` is true and 0 otherwise."""
  ctrl_fields = {'draw_event_en': 1, 'spi_stall_en': 1, 'sq_stall_en': 1, 'reg_at_hwm': 2, 'hiwater': 1,
                 'rt_freq': amd_gpu.SQ_TT_RT_FREQ_4096_CLK, 'util_timer': amd_gpu.SQ_TT_UTIL_TIMER_250_CLK,
                 'mode': int(tracing)}
  ctrl = encode_bitfields('SQ_THREAD_TRACE_CTRL', **ctrl_fields)
  self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_CTRL), ctrl)
|
||||
|
||||
def grbm_gfx_index(self, **kwargs):
  """Write GRBM_GFX_INDEX, built from the given bitfields (callers here pass se_index and the *_broadcast_writes flags)."""
  opcode, index_reg = amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regGRBM_GFX_INDEX)
  self.pkt3(opcode, index_reg, encode_bitfields('GRBM_GFX_INDEX', **kwargs))
|
||||
|
||||
# Magic values from mesa/src/amd/vulkan/radv_sqtt.c:radv_emit_spi_config_cntl and src/amd/common/ac_sqtt.c:ac_sqtt_emit_start
|
||||
def start_trace(self, buf0s:list[HCQBuffer], se_mask:int):
  """Queue the commands that configure and start SQ thread tracing.

  buf0s holds one trace buffer per shader engine (SE); entry `se` is programmed as that SE's BUF0 (address and size
  in 4 KiB units). Bit `se` of se_mask keeps instruction tokens enabled on that SE; when the bit is clear the
  instruction-related tokens are added to the exclude mask. Returns self so calls can be chained.
  """
  self.memory_barrier()
  self.spi_config(tracing=True)
  # One buffer for one SE, mesa does it with a single buffer and ac_sqtt_get_data_offset, but this is simpler and should work just as well
  for se, trace_buf in enumerate(buf0s):
    self.grbm_gfx_index(se_index=se, instance_broadcast_writes=1) # select se, broadcast to all instances in that se
    base_lo, base_hi = data64_le(trace_buf.va_addr>>12)
    self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_BUF0_SIZE),
              encode_bitfields('SQ_THREAD_TRACE_BUF0_SIZE', base_hi=base_hi, size=trace_buf.size>>12))
    self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_BUF0_BASE), base_lo)
    # NOTE: SQTT can only trace instructions on one simd per se, this selects first simd in first wgp in first sa.
    # For RGP to display instruction trace it has to see it on first SE. However ACE/MEC/whatever does the dispatching starting with second se,
    # and on amdgpu/non-AM it also does weird things with dispatch order inside se: around 7 times out of 10 it starts from the last cu, but
    # sometimes not, especially if the kernel has more than one wavefront which means that kernels with small global size might get unlucky and
    # be dispatched on something else and not be seen in instruction tracing tab. You can force the wavefronts of a kernel to be dispatched on the
    # CUs you want to by disabling other CUs via bits in regCOMPUTE_STATIC_THREAD_MGMT_SE<x> and trace even kernels that only have one wavefront.
    self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_MASK),
              encode_bitfields('SQ_THREAD_TRACE_MASK', wtype_include=amd_gpu.SQ_TT_WTYPE_INCLUDE_CS_BIT, simd_sel=0, wgp_sel=0, sa_sel=0))
    # NOTE(review): the previous expression OR-ed SQ_TT_TOKEN_MASK_CONTEXT_BIT twice (a no-op, value unchanged here);
    # possibly a CONFIG bit was intended as the sixth entry - confirm against mesa's ac_sqtt_emit_start before changing.
    reg_include = amd_gpu.SQ_TT_TOKEN_MASK_SQDEC_BIT | amd_gpu.SQ_TT_TOKEN_MASK_SHDEC_BIT | amd_gpu.SQ_TT_TOKEN_MASK_GFXUDEC_BIT | \
                  amd_gpu.SQ_TT_TOKEN_MASK_COMP_BIT | amd_gpu.SQ_TT_TOKEN_MASK_CONTEXT_BIT
    token_exclude = 1 << amd_gpu.SQ_TT_TOKEN_EXCLUDE_PERF_SHIFT
    itrace_on = bool((se_mask >> se) & 0b1)
    if not itrace_on:
      # instruction tracing is off for this SE: drop every instruction-related token as well
      for shift in (amd_gpu.SQ_TT_TOKEN_EXCLUDE_VMEMEXEC_SHIFT, amd_gpu.SQ_TT_TOKEN_EXCLUDE_ALUEXEC_SHIFT,
                    amd_gpu.SQ_TT_TOKEN_EXCLUDE_VALUINST_SHIFT, amd_gpu.SQ_TT_TOKEN_EXCLUDE_IMMEDIATE_SHIFT,
                    amd_gpu.SQ_TT_TOKEN_EXCLUDE_INST_SHIFT):
        token_exclude |= 1 << shift
    self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_TOKEN_MASK),
              encode_bitfields('SQ_THREAD_TRACE_TOKEN_MASK', reg_include=reg_include, token_exclude=token_exclude, bop_events_token_include=1))
    # Enable SQTT
    self.sqtt_config(tracing=True)
  # Restore global broadcasting
  self.grbm_gfx_index(se_broadcast_writes=1, sa_broadcast_writes=1, instance_broadcast_writes=1)
  self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_THREAD_TRACE_ENABLE), 1)
  self.memory_barrier()
  return self
|
||||
|
||||
# Magic values from src/amd/common/ac_sqtt.c:ac_sqtt_emit_stop and src/amd/common/ac_sqtt.c:ac_sqtt_emit_wait
|
||||
def stop_trace(self, ses: int, wptrs: HCQBuffer):
  """Queue the commands that stop SQ thread tracing and save each SE's write pointer.

  For every shader engine index below `ses`, the queue waits on SQ_THREAD_TRACE_STATUS, disables tracing and copies
  SQ_THREAD_TRACE_WPTR into the dword at wptrs.va_addr + se*4. Returns self so calls can be chained.
  """
  def wait_status(func:int, mask:int):
    # WAIT_REG_MEM on SQ_THREAD_TRACE_STATUS: compare (status & mask) with 0 using `func`
    self.pkt3(amd_gpu.PACKET3_WAIT_REG_MEM, amd_gpu.WAIT_REG_MEM_FUNCTION(func),
              ucfgreg(amd_gpu.regSQ_THREAD_TRACE_STATUS, False), 0, 0, mask, 4)

  self.memory_barrier()
  # Start shutting everything down
  self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_THREAD_TRACE_ENABLE), 0)
  self.pkt3(amd_gpu.PACKET3_EVENT_WRITE, amd_gpu.EVENT_TYPE(amd_gpu.THREAD_TRACE_FINISH) | amd_gpu.EVENT_INDEX(0))
  # For each SE wait for finish to complete and copy regSQ_THREAD_TRACE_WPTR to know where in the buffer trace data ends
  se = 0
  while se < ses:
    self.grbm_gfx_index(se_index=se, instance_broadcast_writes=1) # select se, broadcast to all instances in that se
    # Wait for FINISH_PENDING==0
    wait_status(WAIT_REG_MEM_FUNCTION_EQ, gc_11_0_0.SQ_THREAD_TRACE_STATUS__FINISH_PENDING_MASK)
    # Wait for FINISH_DONE!=0
    wait_status(WAIT_REG_MEM_FUNCTION_NEQ, gc_11_0_0.SQ_THREAD_TRACE_STATUS__FINISH_DONE_MASK)
    # Disable SQTT
    self.sqtt_config(tracing=False)
    # Wait for BUSY==0
    wait_status(WAIT_REG_MEM_FUNCTION_EQ, gc_11_0_0.SQ_THREAD_TRACE_STATUS__BUSY_MASK)
    # Copy WPTR to memory (src_sel = perf, dst_sel = tc_l2, wr_confirm = True), ucfgreg with False adds GC_BASE__INST0_SEG1 but not pkt3 reg offset
    wptr_slot = wptrs.va_addr+(se*4)
    self.pkt3(amd_gpu.PACKET3_COPY_DATA, 1 << 20 | 2 << 8 | 4, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_WPTR, False), 0, *data64_le(wptr_slot))
    se += 1
  # Restore global broadcasting
  self.grbm_gfx_index(se_broadcast_writes=1, sa_broadcast_writes=1, instance_broadcast_writes=1)
  self.spi_config(tracing=False)
  self.memory_barrier()
  return self
|
||||
|
||||
def exec(self, prg:AMDProgram, args_state:CLikeArgsState, global_size:tuple[sint, ...], local_size:tuple[sint, ...]):
|
||||
self.bind_args_state(args_state)
|
||||
|
||||
@@ -93,6 +183,20 @@ class AMDComputeQueue(HWQueue):
|
||||
|
||||
user_regs += [*data64_le(args_state.ptr)]
|
||||
|
||||
if prg.dev.sqtt_enabled:
|
||||
self.sqtt_userdata(sqtt.struct_rgp_sqtt_marker_pipeline_bind(
|
||||
_0=sqtt.union_rgp_sqtt_marker_pipeline_bind_0(_0=sqtt.struct_rgp_sqtt_marker_pipeline_bind_0_0(
|
||||
identifier=sqtt.RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE,
|
||||
bind_point=1, # compute
|
||||
)),
|
||||
_1=sqtt.union_rgp_sqtt_marker_pipeline_bind_1(api_pso_hash=data64_le(prg.libhash[0])),
|
||||
))
|
||||
self.sqtt_userdata(sqtt.struct_rgp_sqtt_marker_event(
|
||||
_0=sqtt.union_rgp_sqtt_marker_event_0(_0=sqtt.struct_rgp_sqtt_marker_event_0_0(has_thread_dims=1)),
|
||||
_2=sqtt.union_rgp_sqtt_marker_event_2(cmd_id=prg.dev.cmd_id),
|
||||
), *global_size)
|
||||
prg.dev.cmd_id += 1
|
||||
|
||||
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8))
|
||||
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC1), prg.rsrc1, prg.rsrc2)
|
||||
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC3), 0)
|
||||
@@ -110,6 +214,7 @@ class AMDComputeQueue(HWQueue):
|
||||
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_RESOURCE_LIMITS), 0)
|
||||
|
||||
self.pkt3(amd_gpu.PACKET3_DISPATCH_DIRECT, *global_size, CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN)
|
||||
if prg.dev.sqtt_enabled: self.pkt3(amd_gpu.PACKET3_EVENT_WRITE, amd_gpu.EVENT_TYPE(amd_gpu.THREAD_TRACE_MARKER) | amd_gpu.EVENT_INDEX(0))
|
||||
self.pkt3(amd_gpu.PACKET3_EVENT_WRITE, amd_gpu.EVENT_TYPE(amd_gpu.CS_PARTIAL_FLUSH) | amd_gpu.EVENT_INDEX(EVENT_INDEX_PARTIAL_FLUSH))
|
||||
return self
|
||||
|
||||
@@ -268,7 +373,10 @@ class AMDProgram(HCQProgram):
|
||||
self.enable_private_segment_sgpr: int = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
|
||||
additional_alloc_sz = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t) if self.enable_dispatch_ptr else 0
|
||||
|
||||
super().__init__(CLikeArgsState, self.dev, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz)
|
||||
if dev.sqtt_enabled: self.libhash: tuple[int, int] = struct.unpack('<Q', hashlib.md5(self.lib).digest()[:8])*2
|
||||
|
||||
super().__init__(CLikeArgsState, self.dev, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz, lib=self.lib,
|
||||
base=self.lib_gpu.va_addr)
|
||||
|
||||
def __del__(self):
|
||||
if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True, nolru=True))
|
||||
@@ -285,6 +393,9 @@ class AMDAllocator(HCQAllocator['AMDDevice']):
|
||||
|
||||
MAP_FIXED, MAP_NORESERVE, MAP_LOCKED = 0x10, 0x400, 0 if OSX else 0x2000
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ProfileSQTTEvent(ProfileEvent): device:str; se:int; blob:bytes; itrace:bool # noqa: E702
|
||||
|
||||
@dataclass
|
||||
class AMDQueueDesc:
|
||||
ring: memoryview
|
||||
@@ -600,6 +711,21 @@ class AMDDevice(HCQCompiled):
|
||||
self.max_private_segment_size = 0
|
||||
self._ensure_has_local_memory(128) # set default scratch size to 128 bytes per thread
|
||||
|
||||
# SQTT is disabled by default because of its runtime overhead and large output size (~200 MB to Tensor.full() two 4096x4096 tensors and matmul them)
|
||||
self.sqtt_enabled = PROFILE and bool(getenv("SQTT", 0))
|
||||
if self.sqtt_enabled:
|
||||
if self.arch != 'gfx1100': raise RuntimeError('SQ Thread Tracing is only supported on 7900XTX')
|
||||
if not self.driverless and (ppfeaturemask:=int(HWInterface('/sys/module/amdgpu/parameters/ppfeaturemask', os.O_RDONLY).read(), 16)) & 0x8000:
|
||||
raise RuntimeError("SQTT can't be enabled because of hardware bug, to workaround either use driverless or add "
|
||||
f"ppfeaturemask={(ppfeaturemask&~0x8000):#x} (current {ppfeaturemask=:#x} & ~PP_GFXOFF_MASK) to amdgpu module parameters\n"
|
||||
"For more information read https://github.com/tinygrad/tinygrad/blob/master/extra/sqtt/README.md")
|
||||
SQTT_BUFFER_SIZE = getenv("SQTT_BUFFER_SIZE", 256) # in mb, per shader engine
|
||||
SQTT_NUM = self.dev_iface.props['array_count'] // self.dev_iface.props['simd_arrays_per_engine']
|
||||
self.sqtt_buffers = [self.allocator.alloc(SQTT_BUFFER_SIZE*1024*1024, BufferSpec(cpu_access=True, nolru=True)) for _ in range(SQTT_NUM)]
|
||||
self.sqtt_itrace_se_mask = getenv("SQTT_ITRACE_SE_MASK", 2) # -1 enable all, 0 disable all, >0 bitmask for where to enable instruction tracing
|
||||
self.cmd_id = 0
|
||||
AMDComputeQueue().start_trace(self.sqtt_buffers, self.sqtt_itrace_se_mask).submit(self)
|
||||
|
||||
def create_queue(self, queue_type, ring_size, ctx_save_restore_size=0, eop_buffer_size=0, ctl_stack_size=0, debug_memory_size=0):
|
||||
ring = self.dev_iface.alloc(ring_size, uncached=True, cpu_access=True)
|
||||
gart = self.dev_iface.alloc(0x1000, uncached=True, cpu_access=True)
|
||||
@@ -629,6 +755,24 @@ class AMDDevice(HCQCompiled):
|
||||
|
||||
def on_device_hang(self): self.dev_iface.on_device_hang()
|
||||
|
||||
def _at_profile_finalize(self):
  """Stop SQ thread tracing (when enabled) and append one ProfileSQTTEvent per shader engine, then run the base finalizer.

  The per-SE write pointers are read back through a small CPU-visible buffer filled by stop_trace. Each pointer is
  turned into a byte count relative to the start of that SE's trace buffer, and that many bytes are copied out.
  """
  if self.sqtt_enabled:
    num_se = len(self.sqtt_buffers)
    # stop_trace stores one 32-bit write pointer per SE (at offset se*4), so size the readback buffer in bytes, not entries
    wptrs_buf = self.allocator.alloc(round_up(num_se*4, 0x1000), BufferSpec(cpu_access=True, nolru=True))
    wptrs = to_mv(wptrs_buf.va_addr, wptrs_buf.size)
    AMDComputeQueue().stop_trace(num_se, wptrs_buf).signal(self.timeline_signal, self.timeline_value).submit(self)
    self.timeline_value += 1
    self.synchronize()
    if DEBUG>=2: print('Saving SQTT in profile...')
    for i,buf0 in enumerate(self.sqtt_buffers):
      # both the saved pointer and the buffer base are compared as 29-bit values in 32-byte units; the difference is scaled to bytes
      raw_wptr = struct.unpack('<I', wptrs[i*4:i*4+4])[0]
      wptr = ((raw_wptr & 0x1FFFFFFF) - ((buf0.va_addr//32) & 0x1FFFFFFF)) * 32
      if DEBUG>=2: print(f'Se {i} blob size {wptr:#x}')
      assert wptr >= 0 and wptr <= buf0.size, f"{wptr} > {buf0.size}, should never happen"
      # When sqtt buffer overflows, wptr stops at the last dword
      if wptr >= buf0.size-32: print(f"WARNING: SQTT BUFFER IS FULL (SE {i})! INCREASE SQTT BUFFER SIZE WITH SQTT_BUFFER_SIZE=X (in MB)")
      self.allocator._copyout(sqtt_buf:=memoryview(bytearray(wptr)), buf0)
      Compiled.profile_events += [ProfileSQTTEvent(self.device, i, bytes(sqtt_buf), bool((self.sqtt_itrace_se_mask >> i) & 0b1))]
  super()._at_profile_finalize()
|
||||
|
||||
def finalize(self):
|
||||
self.synchronize()
|
||||
if hasattr(self.dev_iface, 'device_fini'): self.dev_iface.device_fini()
|
||||
|
||||
@@ -3,7 +3,7 @@ from typing import cast, Type, TypeVar, Generic, Any, ClassVar
|
||||
import contextlib, decimal, statistics, time, ctypes, array, os, fcntl
|
||||
from tinygrad.helpers import PROFILE, from_mv, getenv, to_mv, round_up
|
||||
from tinygrad.renderer import Renderer
|
||||
from tinygrad.device import BufferSpec, Compiler, Compiled, LRUAllocator, ProfileRangeEvent, ProfileDeviceEvent
|
||||
from tinygrad.device import BufferSpec, Compiler, Compiled, LRUAllocator, ProfileRangeEvent, ProfileDeviceEvent, ProfileProgramEvent
|
||||
from tinygrad.ops import sym_infer, sint, Variable, UOp
|
||||
from tinygrad.runtime.autogen import libc
|
||||
|
||||
@@ -290,8 +290,9 @@ class CLikeArgsState(HCQArgsState[ProgramType]):
|
||||
self.bind_sints_to_ptr(*vals, ptr=self.ptr + len(prefix or []) * 4 + len(bufs) * 8, fmt='I')
|
||||
|
||||
class HCQProgram(Generic[DeviceType]):
|
||||
def __init__(self, args_state_t:Type[HCQArgsState], dev:DeviceType, name:str, kernargs_alloc_size:int):
|
||||
def __init__(self, args_state_t:Type[HCQArgsState], dev:DeviceType, name:str, kernargs_alloc_size:int, lib:bytes|None=None, base:int|None=None):
|
||||
self.args_state_t, self.dev, self.name, self.kernargs_alloc_size = args_state_t, dev, name, kernargs_alloc_size
|
||||
if PROFILE: Compiled.profile_events += [ProfileProgramEvent(dev.device, name, lib, base)]
|
||||
|
||||
def fill_kernargs(self, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=(), kernargs_ptr:int|None=None) -> HCQArgsState:
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user