NOTE(review): the line structure of this patch was restored from a copy whose
newlines and whitespace runs had been collapsed. Hunk line counts were checked
against the @@ headers, but leading indentation, blank context lines, and the
whitespace-only hunk in extra/mmapeak/template.s (old side assumed tab-indented,
new side assumed space-indented) are best-effort reconstructions. Verify against
the target tree, or apply with `git apply --ignore-whitespace`.
Trailing `#` comments were added to some `+` lines; no lines were added, so the
hunk counts are unchanged, but the post-image hashes on the `index` lines are stale.

diff --git a/extra/bandwidth_test.py b/extra/bandwidth_test.py
new file mode 100644
index 0000000000..7e32ad8d75
--- /dev/null
+++ b/extra/bandwidth_test.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+from tinygrad import Tensor, Device, GlobalCounters, Context, dtypes
+from tinygrad.helpers import getenv, colored
+
+SZ = 8_000_000_000 # elements per tensor; tensors are uint8 below, so this is also the byte count used for GB/s
+GPUS = getenv("GPUS", 4) # TODO: expose a way in tinygrad to access this
+
+if __name__ == "__main__":
+  # create tensors
+  tens = [Tensor.ones(SZ, dtype=dtypes.uint8, device=f"{Device.DEFAULT}:{i}").contiguous() for i in range(GPUS)]
+  Tensor.realize(*tens) # realize every source tensor before any timing starts
+
+  bw = [[0.0]*GPUS for _ in range(GPUS)] # bw[src][dst] in GB/s
+  for i in range(GPUS):
+    for j in range(GPUS):
+      GlobalCounters.reset() # so time_sum_s below reflects only this (i, j) pair
+      with Context(DEBUG=2): # NOTE(review): presumably required for time_sum_s to be populated -- confirm
+        if i == j:
+          # this copy would be optimized out, just add 1
+          (tens[i]+1).realize()
+        else:
+          tens[i].to(f"{Device.DEFAULT}:{j}").realize()
+      t = max(GlobalCounters.time_sum_s, 1e-9) # floor the time so the division below can never be by zero
+      bw[i][j] = SZ / t / 1e9 # GB/s
+
+  def fmt(x): # colour-code one cell: green above 50, yellow above 20, red otherwise
+    c = "green" if x > 50 else "yellow" if x > 20 else "red"
+    return colored(f"{x:6.1f}", c)
+
+  # header
+  print(" " * 8 + " ".join(f"{'d'+str(j):>6}" for j in range(GPUS)))
+  # rows
+  for i in range(GPUS):
+    print(f"{'s'+str(i):>6} -> " + " ".join(fmt(x) for x in bw[i]))
diff --git a/extra/gemm/torch_gemm.py b/extra/gemm/torch_gemm.py
index aee1a8e6aa..da7d2bcd0e 100644
--- a/extra/gemm/torch_gemm.py
+++ b/extra/gemm/torch_gemm.py
@@ -12,7 +12,7 @@ MPS = getenv("MPS", 0)
 if getenv("FP16_ACC"): torch.backends.cuda.matmul.allow_fp16_accumulation = True
 
 for dtype in [torch.float32, torch.float16, torch.bfloat16]:
-  for N in [256, 512, 1024, 2048, 4096]:
+  for N in [256, 512, 1024, 2048, 4096] + ([6144, 8192] if getenv("BIG") else []): # BIG adds the two larger sizes
     FLOPS = N*N*N*2
 
     b = torch.rand((N,N), dtype=dtype)
diff --git a/extra/gpuburn.py b/extra/gpuburn.py
new file mode 100644
index 0000000000..2066773554
--- /dev/null
+++ b/extra/gpuburn.py
@@ -0,0 +1,16 @@
+from tinygrad import Tensor, Device, TinyJit, dtypes
+from tinygrad.helpers import getenv
+
+GPUS = getenv("GPUS", 4) # TODO: expose a way in tinygrad to access this
+N = 6144 # side length of each square matrix
+
+@TinyJit
+def many_matmul(A, B): # returns A@B@B@...@B with 8 multiplications by B
+  out = A
+  for _ in range(8): out = out@B
+  return out
+
+if __name__ == "__main__":
+  A = Tensor.ones(GPUS, N, N, dtype=dtypes.half).shard(devices=tuple([f"{Device.DEFAULT}:{i}" for i in range(GPUS)]), axis=0).contiguous()
+  B = Tensor.ones(GPUS, N, N, dtype=dtypes.half).shard(devices=tuple([f"{Device.DEFAULT}:{i}" for i in range(GPUS)]), axis=0).contiguous()
+  while 1: many_matmul(A, B) # never exits on its own; interrupt the process to stop
diff --git a/extra/mmapeak/mmapeak.py b/extra/mmapeak/mmapeak.py
index 36c2cf842c..36e24fd372 100644
--- a/extra/mmapeak/mmapeak.py
+++ b/extra/mmapeak/mmapeak.py
@@ -32,7 +32,7 @@ def launchBenchmark(instruction, vgprIndices, dense=True, accum=False, extra="")
   src = src.replace("DIRECTIVE", DIRECTIVE)
   lib = COMPILER.compile(src)
   fxn = AMDProgram(DEV, "matmul", lib)
-  elapsed = fxn(global_size=(NUM_WORKGROUPS,1,1), local_size=(WAVE_SIZE*NUM_WAVES,1,1), wait=True)
+  elapsed = min([fxn(global_size=(NUM_WORKGROUPS,1,1), local_size=(WAVE_SIZE*NUM_WAVES,1,1), wait=True) for _ in range(2)]) # best of 2
   FLOPs = FLOPS_PER_MATMUL * NUM_WAVES * NUM_WORKGROUPS * INTERNAL_LOOP * INSTRUCTIONS_PER_LOOP
   print(f"{instruction:<29} : {FLOPs/elapsed/10**12:.2f} T(FL)OPS")
 
diff --git a/extra/mmapeak/template.s b/extra/mmapeak/template.s
index ecf0704e2b..b84aba74f3 100644
--- a/extra/mmapeak/template.s
+++ b/extra/mmapeak/template.s
@@ -3,14 +3,14 @@
 .p2align 8
 .type matmul,@function
 matmul:
-	s_mov_b32 s1, INTERNAL_LOOP
-	s_mov_b32 s2, 0
-	inner_loop:
-		INSTRUCTION
-		s_sub_u32 s1, s1, 1
-		s_cmp_lg_i32 s1, s2
-		s_cbranch_scc1 inner_loop
-	s_endpgm
+  s_mov_b32 s1, INTERNAL_LOOP
+  s_mov_b32 s2, 0
+  inner_loop:
+    INSTRUCTION
+    s_sub_u32 s1, s1, 1
+    s_cmp_lg_i32 s1, s2
+    s_cbranch_scc1 inner_loop
+  s_endpgm
 
 .rodata
 .p2align 6