From 4b9bc1615b2aa08f700da9dd036af3091b6f2dae Mon Sep 17 00:00:00 2001
From: George Hotz <72895+geohot@users.noreply.github.com>
Date: Mon, 6 Mar 2023 09:13:23 -0800
Subject: [PATCH] While fusion (#654)

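* try this: replace the one-shot `if` upcast heuristic with a `while` loop

* readme: put DEBUG first in the matmul example and mention DEBUG=4

* opt comments: reword the upcast heuristic comments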
---
 README.md               | 4 +++-
 tinygrad/codegen/gpu.py | 8 +++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 06a483f95d..20e62576de 100644
--- a/README.md
+++ b/README.md
@@ -71,12 +71,14 @@ print(y.grad)  # dz/dy
 Try a matmul. See how, despite the style, it is fused into one kernel with the power of laziness.
 
 ```python
-OPTLOCAL=1 GPU=1 DEBUG=3 python3 -c "from tinygrad.tensor import Tensor;
+DEBUG=3 OPTLOCAL=1 GPU=1 python3 -c "from tinygrad.tensor import Tensor;
 N = 1024; a, b = Tensor.randn(N, N), Tensor.randn(N, N);
 c = (a.reshape(N, 1, N) * b.permute(1,0).reshape(1, N, N)).sum(axis=2);
 print((c.numpy() - (a.numpy() @ b.numpy())).mean())"
 ```
 
+Change to `DEBUG=4` to see the generated code.
+
 ## Neural networks?
 
 It turns out, a decent autograd tensor library is 90% of what you need for neural networks. Add an optimizer (SGD, RMSprop, and Adam implemented) from tinygrad.nn.optim, write some boilerplate minibatching code, and you have all you need.
diff --git a/tinygrad/codegen/gpu.py b/tinygrad/codegen/gpu.py
index 88beafc754..38b33a2412 100644
--- a/tinygrad/codegen/gpu.py
+++ b/tinygrad/codegen/gpu.py
@@ -200,8 +200,8 @@ class GPUCodegen(ASTKernel):
 
     # **** below this line need to be optional and benchmarked ****
 
-    # potentially do a second upcast based on a heuristic. this is optional and has nothing to do with float4
-    if prod(self.sts[0].shape[:self.first_reduce]) >= 1024:
+    # potentially do more upcasts of non reduce axes based on a heuristic
+    while prod(self.sts[0].shape[:self.first_reduce]) >= 1024:
       xb_choices = []
       for axis, upcast_amount in itertools.product(range(self.first_reduce), [3,4]):   # consider all the non reduce axes, and a 3 or 4 reduce
         # if it mods, and some buffer has stride 0 on axis while having no stride 0 in the buftoken
@@ -213,8 +213,10 @@ class GPUCodegen(ASTKernel):
         self.shift_to(xb_choices[0][2], amount=xb_choices[0][3])
         self.upcast()
         self.simplify_ones()
+      else:
+        break
 
-    # if last dim <= 5 and it's a reduce dim, upcast (loop unrolling). no simplify needed since it's just an upcast. NOTE: careful, this has broken VALIDHACKS
+    # if last dim <= 5 and it's a reduce dim, upcast the reduce (loop unrolling). no simplify needed since it's just an upcast. NOTE: careful, this has broken VALIDHACKS
     if self.first_reduce < self.shape_len and self.full_shape[-1] <= 5 and (max([x.size() for i,x in enumerate(self.buftokens) if self.bufs[i] in self.earlybufs]) <= 4 or not any(r for _,_,r in self.buftokens[self.full_buf_index].axis)):
       self.upcast()