mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-11 23:46:02 +08:00
am: large allocs aligned to 2mb to use 2mb pages (#15609)
This commit is contained in:
2
test/external/external_test_am.py
vendored
2
test/external/external_test_am.py
vendored
@@ -27,7 +27,7 @@ class FakeAM:
|
||||
self.gmc = FakeGMC(self)
|
||||
self.mm = AMMemoryManager(self, self.vram_size, boot_size=(32 << 20), pt_t=AMPageTableEntry, va_shifts=[12, 21, 30, 39], va_bits=48,
|
||||
first_lv=am.AMDGPU_VM_PDB2, va_base=AMMemoryManager.va_allocator.base,
|
||||
palloc_ranges=[(1 << (i + 12), 0x1000) for i in range(9 * (3 - am.AMDGPU_VM_PDB2), -1, -1)])
|
||||
palloc_ranges=[(1 << (i + 12), (2 << 20) if i >= 9 else 0x1000) for i in range(9 * (3 - am.AMDGPU_VM_PDB2), -1, -1)])
|
||||
self.is_booting = False
|
||||
self.ip_ver = {am.GC_HWIP: (11, 0, 0)}
|
||||
def paddr2cpu(self, paddr:int) -> int: return paddr + mv_address(self.vram)
|
||||
|
||||
@@ -548,12 +548,6 @@ class PCIIface(PCIIfaceBase):
|
||||
self.gpfifo_class, self.compute_class, self.dma_class = (gsp:=self.dev_impl.gsp).gpfifo_class, gsp.compute_class, gsp.dma_class
|
||||
self.viddec_class = None
|
||||
|
||||
def alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, force_devmem=False, **kwargs) -> HCQBuffer:
|
||||
# Force use of huge pages for large allocations. NVDev will attempt to use huge pages in any case,
|
||||
# but if the size is not aligned, the tail will be allocated with 4KB pages, increasing TLB pressure.
|
||||
return super().alloc(round_up(size, mmap.PAGESIZE if uncached or host else ((2 << 20) if size >= (8 << 20) else (4 << 10))),
|
||||
host=host, uncached=uncached, cpu_access=cpu_access, contiguous=contiguous, force_devmem=force_devmem, **kwargs)
|
||||
|
||||
def setup_usermode(self): return 0xce000000, self.pci_dev.map_bar(bar=0, fmt='I', off=0xbb0000, size=0x10000)
|
||||
def setup_vm(self, vaspace): pass
|
||||
def setup_gpfifo_vm(self, gpfifo): pass
|
||||
|
||||
@@ -198,8 +198,8 @@ class AMDev:
|
||||
|
||||
# Memory manager & firmware
|
||||
self.mm = AMMemoryManager(self, self.vram_size - self.reserved_vram_size, boot_size=(32 << 20), pt_t=AMPageTableEntry, va_shifts=[12, 21, 30, 39],
|
||||
va_bits=48, first_lv=am.AMDGPU_VM_PDB2, va_base=AMMemoryManager.va_allocator.base,
|
||||
palloc_ranges=[(1 << (i + 12), 0x1000) for i in range(9 * (3 - am.AMDGPU_VM_PDB2), -1, -1)], reserve_ptable=not self.large_bar)
|
||||
va_bits=48, first_lv=am.AMDGPU_VM_PDB2, va_base=AMMemoryManager.va_allocator.base, reserve_ptable=not self.large_bar,
|
||||
palloc_ranges=[(1 << (i + 12), (2 << 20) if i >= 9 else 0x1000) for i in range(9 * (3 - am.AMDGPU_VM_PDB2), -1, -1)])
|
||||
self.fw = AMFirmware(self)
|
||||
|
||||
# Initialize IP blocks
|
||||
|
||||
@@ -248,6 +248,10 @@ class PCIIfaceBase:
|
||||
|
||||
def alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, force_devmem=False, **kwargs) -> HCQBuffer:
|
||||
should_use_sysmem = host or ((cpu_access if self.is_bar_small() else (uncached and cpu_access)) and not force_devmem)
|
||||
|
||||
# Align size to huge pages for large allocations, otherwise the unaligned tail falls back to 4KB pages, increasing TLB pressure.
|
||||
size = round_up(size, mmap.PAGESIZE if should_use_sysmem else ((2 << 20) if size >= (8 << 20) else (4 << 10)))
|
||||
|
||||
if should_use_sysmem:
|
||||
vaddr = self.dev_impl.mm.alloc_vaddr(size:=round_up(size, mmap.PAGESIZE), align=mmap.PAGESIZE)
|
||||
memview, paddrs = self.pci_dev.alloc_sysmem(size, vaddr=vaddr, contiguous=contiguous)
|
||||
|
||||
Reference in New Issue
Block a user