From d7330ea6ade338087f2ef819bc7516663cbf48fb Mon Sep 17 00:00:00 2001 From: nimlgen <138685161+nimlgen@users.noreply.github.com> Date: Thu, 10 Apr 2025 00:39:45 +0300 Subject: [PATCH] amd: refactor sqtt into sep functions (#9816) * amd: refactor sqtt into sep functions * fix --- tinygrad/runtime/ops_amd.py | 64 +++++++++++++++++++++---------------- 1 file changed, 37 insertions(+), 27 deletions(-) diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index 7eca089c12..412c5e9337 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -59,11 +59,6 @@ class AMDComputeQueue(HWQueue): if self.dev.xccs > 1: self._q[prev_len-1] |= (len(self._q) - prev_len) - def sqtt_userdata(self, data, *extra_dwords): - data_ints = [x[0] for x in struct.iter_unpack('> 8)) self.wreg(self.gc.regCOMPUTE_PGM_RSRC1, prg.rsrc1, prg.rsrc2) self.wreg(self.gc.regCOMPUTE_PGM_RSRC3, prg.rsrc3) self.wreg(self.gc.regCOMPUTE_TMPRING_SIZE, prg.dev.tmpring_size) + if prg.dev.has_scratch_base_registers: for xcc_id in range(self.dev.xccs): with self.pred_exec(xcc_mask=1<> 8)) + if (10,0,0) <= prg.dev.target < (11,0,0): self.wreg(self.gc.mmCP_COHER_START_DELAY, 0x20) + self.wreg(self.gc.regCOMPUTE_RESTART_X, 0, 0, 0) self.wreg(self.gc.regCOMPUTE_STATIC_THREAD_MGMT_SE0, 0xFFFFFFFF, 0xFFFFFFFF) self.wreg(self.gc.regCOMPUTE_STATIC_THREAD_MGMT_SE2, 0xFFFFFFFF, 0xFFFFFFFF) - if prg.dev.target >= (11,0,0): - self.wreg(self.gc.regCOMPUTE_STATIC_THREAD_MGMT_SE4, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF) + if prg.dev.target >= (11,0,0): self.wreg(self.gc.regCOMPUTE_STATIC_THREAD_MGMT_SE4, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF) + self.wreg(self.gc.regCOMPUTE_USER_DATA_0, *user_regs) + self.wreg(self.gc.regCOMPUTE_RESOURCE_LIMITS, 0) self.wreg(self.gc.regCOMPUTE_START_X, 0, 0, 0, *local_size, 0, 0) - self.wreg(self.gc.regCOMPUTE_RESOURCE_LIMITS, 0) gfx10p = {'cs_w32_en': int(prg.wave32)} if prg.dev.target >= (10,0,0) else {} DISPATCH_INITIATOR = self.gc.regCOMPUTE_DISPATCH_INITIATOR.encode(**gfx10p, force_start_at_000=1, compute_shader_en=1) self.pkt3(self.pm4.PACKET3_DISPATCH_DIRECT, *global_size, DISPATCH_INITIATOR) + if prg.dev.sqtt_enabled: self.pkt3(self.pm4.PACKET3_EVENT_WRITE, self.pm4.EVENT_TYPE(self.soc.THREAD_TRACE_MARKER) | self.pm4.EVENT_INDEX(0)) self.pkt3(self.pm4.PACKET3_EVENT_WRITE, self.pm4.EVENT_TYPE(self.soc.CS_PARTIAL_FLUSH) | self.pm4.EVENT_INDEX(EVENT_INDEX_PARTIAL_FLUSH)) + if self.dev.xccs > 1: self.release_mem(cache_flush=True) self.acquire_mem(gli=0) @@ -850,7 +860,7 @@ class AMDDevice(HCQCompiled): self.sqtt_buffers = [self.allocator.alloc(SQTT_BUFFER_SIZE*1024*1024, BufferSpec(cpu_access=True, nolru=True)) for _ in range(SQTT_NUM)] self.sqtt_itrace_se_mask = getenv("SQTT_ITRACE_SE_MASK", 2) # -1 enable all, 0 disable all, >0 bitmask for where to enable instruction tracing self.cmd_id = 0 - AMDComputeQueue(self).start_trace(self.sqtt_buffers, self.sqtt_itrace_se_mask).submit(self) + AMDComputeQueue(self).sqtt_start(self.sqtt_buffers, self.sqtt_itrace_se_mask).submit(self) def create_queue(self, queue_type, ring_size, ctx_save_restore_size=0, eop_buffer_size=0, ctl_stack_size=0, debug_memory_size=0): ring = self.dev_iface.alloc(ring_size, uncached=True, cpu_access=True) @@ -888,7 +898,7 @@ class AMDDevice(HCQCompiled): if self.sqtt_enabled: wptrs_buf = self.allocator.alloc(round_up(len(self.sqtt_buffers), 0x1000), BufferSpec(cpu_access=True, nolru=True)) wptrs = to_mv(wptrs_buf.va_addr, wptrs_buf.size) - AMDComputeQueue(self).stop_trace(len(self.sqtt_buffers), wptrs_buf).signal(self.timeline_signal, self.next_timeline()).submit(self) + AMDComputeQueue(self).sqtt_stop(len(self.sqtt_buffers), wptrs_buf).signal(self.timeline_signal, self.next_timeline()).submit(self) self.synchronize() if DEBUG>=2: print('Saving SQTT in profile...') for i,buf0 in enumerate(self.sqtt_buffers):