diff --git a/examples/stable_diffusion.py b/examples/stable_diffusion.py
index fb62758bff..37eba6b463 100644
--- a/examples/stable_diffusion.py
+++ b/examples/stable_diffusion.py
@@ -628,7 +628,6 @@ class StableDiffusion:
 FILENAME = "weights/sd-v1-4.ckpt"
 
 if __name__ == "__main__":
-  Tensor.no_init = True
   # WTF!! no_grad breaks it (only with OPENCL, now fixed)
   Tensor.no_grad = True
   model = StableDiffusion()
diff --git a/test/test_mnist.py b/test/test_mnist.py
index 81bfcc402c..51a0da601a 100644
--- a/test/test_mnist.py
+++ b/test/test_mnist.py
@@ -14,8 +14,8 @@ X_train, Y_train, X_test, Y_test = fetch_mnist()
 # create a model
 class TinyBobNet:
   def __init__(self):
-    self.l1 = Tensor.uniform(784, 128)
-    self.l2 = Tensor.uniform(128, 10)
+    self.l1 = Tensor.scaled_uniform(784, 128)
+    self.l2 = Tensor.scaled_uniform(128, 10)
 
   def parameters(self):
     return get_parameters(self)
@@ -30,9 +30,9 @@ class TinyConvNet:
     conv = 3
     #inter_chan, out_chan = 32, 64
     inter_chan, out_chan = 8, 16   # for speed
-    self.c1 = Tensor.uniform(inter_chan,1,conv,conv)
-    self.c2 = Tensor.uniform(out_chan,inter_chan,conv,conv)
-    self.l1 = Tensor.uniform(out_chan*5*5, 10)
+    self.c1 = Tensor.scaled_uniform(inter_chan,1,conv,conv)
+    self.c2 = Tensor.scaled_uniform(out_chan,inter_chan,conv,conv)
+    self.l1 = Tensor.scaled_uniform(out_chan*5*5, 10)
 
   def parameters(self):
     return get_parameters(self)
diff --git a/tinygrad/nn/__init__.py b/tinygrad/nn/__init__.py
index 43c5cce8b6..86651fba62 100644
--- a/tinygrad/nn/__init__.py
+++ b/tinygrad/nn/__init__.py
@@ -37,21 +37,22 @@ class BatchNorm2D:
       self.batch_invstd = batch_var.add(self.eps)**-0.5
     return batch_normalize(x, self.weight, self.bias, batch_mean, self.batch_invstd)
 
+# TODO: is Glorot uniform for weights plus zeros for bias a good default init?
 class Conv2d:
   def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, bias=True):
     self.kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else (kernel_size[0], kernel_size[1])
     self.stride = (stride, stride) if isinstance(stride, int) else (stride[0], stride[1])
     self.padding = (padding, ) * 4 if isinstance(padding, int) else ((padding[0], padding[0], padding[1], padding[1]) if len(padding) == 2 else padding)
-    self.weight = (Tensor.uniform if not Tensor.no_init else Tensor.empty)(out_channels, in_channels, self.kernel_size[0], self.kernel_size[1])
-    self.bias = (Tensor.uniform if not Tensor.no_init else Tensor.empty)(out_channels) if bias else None
+    self.weight = Tensor.glorot_uniform(out_channels, in_channels, self.kernel_size[0], self.kernel_size[1])
+    self.bias = Tensor.zeros(out_channels) if bias else None
 
   def __call__(self, x):
     return x.conv2d(self.weight, self.bias, padding=self.padding, stride=self.stride)
 
 class Linear:
   def __init__(self, in_features, out_features, bias=True):
-    self.weight = (Tensor.uniform if not Tensor.no_init else Tensor.empty)(out_features, in_features)
-    self.bias = (Tensor.zeros if not Tensor.no_init else Tensor.empty)(out_features) if bias else None
+    self.weight = Tensor.glorot_uniform(out_features, in_features)
+    self.bias = Tensor.zeros(out_features) if bias else None
 
   def __call__(self, x):
     return x.linear(self.weight.transpose(), self.bias)
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index cd98b30a96..1fa06ba51a 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -11,8 +11,7 @@ from tinygrad.ops import LazyBuffer
 # **** start with two base classes, Tensor and Function ****
 
 class Tensor:
-  # TODO: remove no_init when uniform is late bind
-  training, no_grad, no_init = False, False, False
+  training, no_grad = False, False
 
   def __init__(self, data, device=Device.DEFAULT, requires_grad=None):
     if isinstance(data, list):
@@ -106,8 +105,17 @@ class Tensor:
   def arange(cls, stop, start=0, **kwargs): return cls(np.arange(start=start, stop=stop, dtype=np.float32), **kwargs)
 
   # TODO: uniform should be a late binding thing
+  # Returns random numbers uniformly distributed in [-1, 1)
+  # NOTE: behavior change: values are no longer scaled by prod(shape)**-0.5 (use scaled_uniform for that)
   @classmethod
-  def uniform(cls, *shape, **kwargs): return cls(((np.random.default_rng().random(size=shape, dtype=np.float32) * 2 - 1) * prod(shape)**-0.5), **kwargs)
+  def uniform(cls, *shape, **kwargs): return cls((np.random.default_rng().random(size=shape, dtype=np.float32) * 2 - 1), **kwargs)
+
+  @classmethod
+  def scaled_uniform(cls, *shape, **kwargs): return cls((np.random.default_rng().random(size=shape, dtype=np.float32) * 2 - 1) * (prod(shape)**-0.5), **kwargs)
+
+  @classmethod
+  # Glorot uniform init, U(-limit, limit) with limit = sqrt(6 / (shape[0] + prod(shape[1:]))): https://www.tensorflow.org/api_docs/python/tf/keras/initializers/GlorotUniform
+  def glorot_uniform(cls, *shape, **kwargs): return cls((np.random.default_rng().random(size=shape, dtype=np.float32) * 2 - 1) * ((6/(shape[0]+prod(shape[1:])))**0.5), **kwargs)
 
   @classmethod
   def eye(cls, dim, **kwargs): return cls(np.eye(dim, dtype=np.float32), **kwargs)
@@ -254,6 +262,7 @@ class Tensor:
 
   def __neg__(self): return 0.0-self
   def sqrt(self): return self.pow(0.5)
+  def square(self): return self*self
   def clip(self, min_, max_): return ((self-min_).relu()+min_) - (self-max_).relu()
   def abs(self): return self.relu() + (-self).relu()
   def sign(self): return self / (self.abs() + 1e-10)
@@ -312,7 +321,7 @@ class Function:
   def __init__(self, device:str, *tensors:Tensor):
     self.device, self.parents = device, tensors
     self.needs_input_grad = [t.requires_grad for t in self.parents]
-    self.requires_grad = any(self.needs_input_grad)
+    self.requires_grad = True if any(self.needs_input_grad) else (None if any([x is None for x in self.needs_input_grad]) else False)
     self.saved_tensors : List[Tensor] = []
 
   def forward(self, *args, **kwargs): raise NotImplementedError(f"forward not implemented for {type(self)}")