diff --git a/test/external/speed_compare_cuda_nv.py b/test/external/speed_compare_cuda_nv.py
new file mode 100644
index 0000000000..48b7578864
--- /dev/null
+++ b/test/external/speed_compare_cuda_nv.py
@@ -0,0 +1,57 @@
+from tinygrad import Device
+from tinygrad.helpers import getenv, colored
+from extra.optimization.helpers import load_worlds, ast_str_to_lin
+from tinygrad.features.search import bufs_from_lin
+
+# move to helpers?
+def colorize_float(x):
+  """Format x as "{x:7.2f}x" and color it: green below 0.75, red above 1.15, yellow otherwise."""
+  ret = f"{x:7.2f}x"
+  if x < 0.75: return colored(ret, 'green')
+  if x > 1.15: return colored(ret, 'red')
+  return colored(ret, 'yellow')
+
+def _time_program(prg, bufs, label, runs=5):
+  """Call prg once as a warmup, then `runs` more times, and return the list of values those calls returned.
+
+  If any call raises RuntimeError, print "<label> FAILED" and return [1e9] so the sweep keeps going.
+  """
+  try:
+    prg(bufs, {}, wait=True)  # warmup, result discarded
+    return [prg(bufs, {}, wait=True) for _ in range(runs)]
+  except RuntimeError:
+    print(f"{label} FAILED")
+    return [1e9]
+
+if __name__ == "__main__":
+  ast_strs = load_worlds(filter_reduce=False, filter_novariable=True)
+  cudev = Device["CUDA"]
+  nvdev = Device["NV"]
+
+  # run a single kernel with e.g.: NUM=112 python3 test/external/speed_compare_cuda_nv.py
+  single = getenv("NUM", -1)
+  if single != -1: ast_strs = ast_strs[single:single+1]
+
+  # running sums of the best value per kernel, used for the cumulative NV/CUDA ratio printed first on each line
+  average_tm_cuda, average_tm_nv = 0, 0
+  for num,ast in enumerate(ast_strs):
+    # cuda compile
+    culin = ast_str_to_lin(ast, opts=cudev.compiler.compiler_opts)
+    culin.hand_coded_optimizations()
+    cuda_prg = cudev.to_program(culin)
+    cubufs = bufs_from_lin(culin)
+
+    # nv compile
+    nvlin = ast_str_to_lin(ast, opts=nvdev.compiler.compiler_opts)
+    nvlin.hand_coded_optimizations()
+    nv_prg = nvdev.to_program(nvlin)
+    nvbufs = bufs_from_lin(nvlin)
+
+    # best of the timed runs on each backend (1e9 when that backend failed); CUDA is run before NV
+    best_cuda = min(_time_program(cuda_prg, cubufs, "CUDA"))
+    best_nv = min(_time_program(nv_prg, nvbufs, "NV"))
+    average_tm_cuda += best_cuda
+    average_tm_nv += best_nv
+    ratio = best_nv/best_cuda
+    print(f"{average_tm_nv/average_tm_cuda:5.2f}x -- {num:4d} {colorize_float(ratio)} {best_nv*1e6:7.2f} us", nvlin.name)
+    if ratio > 1.1: print(f"NV slower {ratio}", nvlin.ast, nvlin.applied_opts)
diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py
index 4fa9ae8c12..71894bb1d0 100644
--- a/tinygrad/runtime/ops_nv.py
+++ b/tinygrad/runtime/ops_nv.py
@@ -196,7 +196,7 @@ class NVProgram:
     self.constbuffer_0 = [0] * 88
     self.constbuffer_0[6:12] = [*nvdata64_le(self.device.shared_mem_window), *nvdata64_le(self.device.local_mem_window), *nvdata64_le(0xfffdc0)]
 
-    smem_config = min(shmem_conf * 1024 for shmem_conf in [8, 16, 32, 64, 96] if shmem_conf * 1024 >= self.shmem_usage) // 4096 + 1
+    smem_config = min(shmem_conf * 1024 for shmem_conf in [32, 64, 100] if shmem_conf * 1024 >= self.shmem_usage) // 4096 + 1
     self.qmd = qmd_struct_t(qmd_group_id=0x3f, sm_global_caching_enable=1, invalidate_texture_header_cache=1, invalidate_texture_sampler_cache=1,
                             invalidate_texture_data_cache=1, invalidate_shader_data_cache=1, api_visible_call_limit=1, sampler_index=1,
                             cwd_membar_type=nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR, qmd_major_version=3,