From 8b5bcf309a2ee4f2f6ba3c0f2eee97e5220dd5e3 Mon Sep 17 00:00:00 2001
From: qazal <77887910+Qazalin@users.noreply.github.com>
Date: Tue, 11 Jun 2024 02:49:29 +0800
Subject: [PATCH] process replay in all of CI (#4884)

---
 .github/workflows/test.yml          | 53 ++++++++++++++++++++++-------
 test/test_fusion_op.py              |  2 --
 test/test_search.py                 |  3 +-
 test/testextra/test_export_model.py |  2 --
 tinygrad/codegen/linearizer.py      |  2 +-
 5 files changed, 42 insertions(+), 20 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 4161d9dddf..25bbaa8171 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -17,6 +17,28 @@ on:
          type: boolean
 
 jobs:
+  check_process_replay: # gate job: computes once whether process replay should run; other jobs consume the result via `needs`
+    name: Check process replay
+    runs-on: ubuntu-latest
+    outputs:
+      run_process_replay: ${{ steps.set-env.outputs.run_process_replay }} # "1" or "0", as written by the set-env step below
+    timeout-minutes: 5
+
+    steps:
+    - name: Checkout Code
+      uses: actions/checkout@v4
+      with:
+        fetch-depth: 2 # NOTE: this fetches the HEAD commit of the PR, which `git show` below reads
+    - name: Set process replay
+      id: set-env
+      run: |
+        COMMIT_MESSAGE=$(git show -s --format=%B ${{ github.event.pull_request.head.sha }}) # full commit message of the PR head sha
+        if { echo "$COMMIT_MESSAGE" | grep -q "\[run_process_replay\]" || [ "${{ github.event.inputs.run_process_replay }}" == "true" ]; } && [ "$GITHUB_REF_NAME" != "master" ]; then # opt in via commit-message tag or workflow input; never when the ref is master
+          echo "run_process_replay=1" >> "$GITHUB_OUTPUT" # key spelled exactly as it is read in `outputs:` above
+        else
+          echo "run_process_replay=0" >> "$GITHUB_OUTPUT" # always emit a value so consumers compare against '1'/'0', not empty
+        fi
+
   uops:
     name: uops tests
     runs-on: ubuntu-latest
@@ -126,6 +148,10 @@ jobs:
       fail-fast: false
       matrix:
         task: [optimage, openpilot, onnx]
+    needs: check_process_replay
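+    #env: # NOTE(review): left disabled, so any step in this job gated on env.RUN_PROCESS_REPLAY is skipped unless the variable is set elsewhere — confirm this is intended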
+      #RUN_PROCESS_REPLAY: ${{ needs.check_process_replay.outputs.run_process_replay }}
+
     name: ${{ matrix.task=='optimage'&&'GPU OPT and IMAGE Tests' || matrix.task=='openpilot'&&'openpilot (OpenCL) Tests' || matrix.task=='onnx'&&'ONNX+Optimization Tests' }}
     runs-on: ubuntu-20.04
     timeout-minutes: 10
@@ -211,6 +237,9 @@ jobs:
       - if: ${{ matrix.task == 'onnx' }}
         name: Test THREEFRY
         run: PYTHONPATH=. THREEFRY=1 GPU=1 python3 -m pytest test/test_randomness.py test/test_jit.py
+      - name: Run process replay tests
+        if: env.RUN_PROCESS_REPLAY == '1'
+        run: cp test/external/replay_codegen.py ./replay_codegen.py && git fetch origin master && git checkout origin/master && PYTHONPATH=. python3 replay_codegen.py
 
   #testwebgpu:
   #  name: WebGPU Tests
@@ -256,6 +285,9 @@ jobs:
   testmetal:
     name: Metal Tests
     runs-on: macos-14
+    needs: check_process_replay
+    env:
+      RUN_PROCESS_REPLAY: ${{ needs.check_process_replay.outputs.run_process_replay }}
     timeout-minutes: 10
 
     steps:
@@ -299,7 +331,9 @@ jobs:
       run: PYTHONPATH="." METAL=1 CACHELEVEL=0 FUZZ_ALL_ACTIONS=1 DEPTH=2 FUZZ_N=48 FUZZ_MAX_SIZE=10000000 python test/external/fuzz_linearizer.py
     - name: Fuzz Test models schedule
       run: FUZZ_SCHEDULE=1 FUZZ_SCHEDULE_MAX_PATHS=5 python -m pytest test/models/test_train.py test/models/test_end2end.py
-
+    - name: Run process replay tests
+      if: env.RUN_PROCESS_REPLAY == '1'
+      run: cp test/external/replay_codegen.py ./replay_codegen.py && git fetch origin master && git checkout origin/master && PYTHONPATH=. python3 replay_codegen.py
 
 #  testwebgl:
 #    name: WebGL Tests
@@ -339,6 +373,9 @@ jobs:
       fail-fast: false
       matrix:
         backend: [llvm, clang, gpu, ptx, amd, nv] #, triton]
+    needs: check_process_replay
+    env:
+      RUN_PROCESS_REPLAY: ${{ needs.check_process_replay.outputs.run_process_replay }}
 
     name: Tests on (${{ matrix.backend }})
     runs-on: ubuntu-latest
@@ -347,8 +384,6 @@ jobs:
     steps:
       - name: Checkout Code
         uses: actions/checkout@v4
-        with:
-          fetch-depth: 2 # NOTE: this fetches the HEAD commit of the PR
       - name: Set up Python 3.11
         uses: actions/setup-python@v5
         with:
@@ -364,12 +399,7 @@ jobs:
           path: ~/.cache/tinygrad/downloads/
           key: downloads-cache-${{ matrix.backend }}-${{ env.DOWNLOAD_CACHE_VERSION }}
       - name: Set env
-        run: |
-          COMMIT_MESSAGE=$(git show -s --format=%B ${{ github.event.pull_request.head.sha }})
-          if { echo "$COMMIT_MESSAGE" | grep -q "\[run_process_replay\]" || [ "${{ github.event.inputs.run_process_replay }}" == "true" ]; } && [ "$GITHUB_REF_NAME" != "master" ]; then
-            echo "RUN_PROCESS_REPLAY=1" >> $GITHUB_ENV
-          fi
-          printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'clang' && 'CLANG=1' || matrix.backend == 'gpu' && 'GPU=1' || matrix.backend == 'PTX' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\nPTX=1' || matrix.backend == 'triton' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\nTRITON=1\nTRITON_PTXAS_PATH=/usr/bin/ptxas' || matrix.backend == 'amd' && 'AMD=1\nMOCKGPU=1\nFORWARD_ONLY=1' || matrix.backend == 'nv' && 'NV=1\nMOCKGPU=1\nFORWARD_ONLY=1' }}" >> $GITHUB_ENV
+        run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'clang' && 'CLANG=1' || matrix.backend == 'gpu' && 'GPU=1' || matrix.backend == 'PTX' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\nPTX=1' || matrix.backend == 'triton' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\nTRITON=1\nTRITON_PTXAS_PATH=/usr/bin/ptxas' || matrix.backend == 'amd' && 'AMD=1\nMOCKGPU=1\nFORWARD_ONLY=1' || matrix.backend == 'nv' && 'NV=1\nMOCKGPU=1\nFORWARD_ONLY=1' }}" >> $GITHUB_ENV
       - name: Install OpenCL
         if: matrix.backend == 'gpu'
         run: |
@@ -486,10 +516,7 @@ jobs:
           cat test/models/efficientnet/Chicken.jpg | ./recognize | grep cock
       - name: Run process replay tests
         if: env.RUN_PROCESS_REPLAY == '1'
-        run: |
-          cp test/external/replay_codegen.py ./replay_codegen.py
-          git fetch origin master && git checkout origin/master
-          PYTHONPATH=. python3 replay_codegen.py
+        run: cp test/external/replay_codegen.py ./replay_codegen.py && git fetch origin master && git checkout origin/master && PYTHONPATH=. python3 replay_codegen.py
 
   #testunicorn:
   #  name: ARM64 unicorn Test
diff --git a/test/test_fusion_op.py b/test/test_fusion_op.py
index b0de3aed6b..faab57cf8c 100644
--- a/test/test_fusion_op.py
+++ b/test/test_fusion_op.py
@@ -4,7 +4,6 @@ import numpy as np
 from tinygrad import Tensor, dtypes
 from tinygrad.engine.schedule import create_schedule
 from tinygrad.engine.realize import lower_schedule_item, run_schedule
-from tinygrad.helpers import getenv
 
 class TestFusionOp(unittest.TestCase):
   def test_contiguous_add(self):
@@ -23,7 +22,6 @@ class TestFusionOp(unittest.TestCase):
     outd = out.tolist()
     assert all(x == 20.0 for x in outd)
 
-  @unittest.skipIf(getenv("RUN_PROCESS_REPLAY"), "very slow")
   def test_recursive_add(self):
     st = time.perf_counter()
     a = Tensor([1,2,3,4])
diff --git a/test/test_search.py b/test/test_search.py
index 511f838686..057930c514 100644
--- a/test/test_search.py
+++ b/test/test_search.py
@@ -8,7 +8,7 @@ from tinygrad.device import Device, Buffer
 from tinygrad.ops import LazyOp, LoadOps, BufferOps, ReduceOps, BinaryOps, MemBuffer, ConstBuffer
 from tinygrad.tensor import Tensor
 from tinygrad.dtype import dtypes
-from tinygrad.helpers import Context, GlobalCounters, getenv
+from tinygrad.helpers import Context, GlobalCounters
 from tinygrad.engine.realize import capturing
 from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.view import View
@@ -43,7 +43,6 @@ class TestTimeLinearizer(unittest.TestCase):
     time_linearizer(lin, bufs, allow_test_size=False, cnt=2, disable_cache=True, clear_l2=True)
     assert GlobalCounters.kernel_count == kernel_count, "kernel count was incremented by time_linearizer"
 
-@unittest.skipIf(getenv("RUN_PROCESS_REPLAY"), "TODO: run process replay for BEAM=2")
 class TestBEAM(unittest.TestCase):
   def test_dynamic_beam(self):
     # TODO: make this infra globally usable
diff --git a/test/testextra/test_export_model.py b/test/testextra/test_export_model.py
index 9c203f1609..4d0671c39b 100644
--- a/test/testextra/test_export_model.py
+++ b/test/testextra/test_export_model.py
@@ -1,6 +1,5 @@
 import unittest
 from extra.export_model import export_model, EXPORT_SUPPORTED_DEVICE
-from tinygrad.helpers import getenv
 from tinygrad.tensor import Tensor, Device
 import json
 
@@ -14,7 +13,6 @@ class MockMultiOutputModel:
 
 # TODO: move compile_efficientnet tests here
 @unittest.skipUnless(Device.DEFAULT in EXPORT_SUPPORTED_DEVICE, f"Model export is not supported on {Device.DEFAULT}")
-@unittest.skipIf(getenv("RUN_PROCESS_REPLAY"), "TODO: kernel ordering is non-deterministic")
 class TextModelExport(unittest.TestCase):
   def test_multi_input_model_export(self):
     model = MockMultiInputModel()
diff --git a/tinygrad/codegen/linearizer.py b/tinygrad/codegen/linearizer.py
index 4d829082a3..dd0395db0f 100644
--- a/tinygrad/codegen/linearizer.py
+++ b/tinygrad/codegen/linearizer.py
@@ -467,7 +467,7 @@ class Linearizer(Kernel):
     self.linearize()
     info = get_lazyop_info(self.ast[0])
     src = self.opts.render(to_function_name(self.name), self.uops)
-    if getenv("RUN_PROCESS_REPLAY"): diskcache_put("process_replay", "".join(map(str,[self.ast,self.applied_opts])), self)
+    if getenv("RUN_PROCESS_REPLAY"): diskcache_put("process_replay", id(self), self)
     ops, mem = self.uops.flops_mem()
     run_count = prod((self.global_size if self.global_size else []) + (self.local_size if self.local_size else []))
     # NOTE: we use min here to ignore the indexing FLOPS