Fix weight init: this work? (#391)

* this work? * glorot uniform * requires_grad broke * propagate the None correctly * so this weight init works * ahh, i think it's this * can't beat this * glorot is best for ae * remove comments
2026-06-13 00:15:35 +08:00 · 2022-09-25 16:46:33 -04:00
parent ff11c4316b
commit 60df954377
4 changed files with 23 additions and 14 deletions
--- a/examples/stable_diffusion.py
+++ b/examples/stable_diffusion.py
@@ -628,7 +628,6 @@ class StableDiffusion:
 FILENAME = "weights/sd-v1-4.ckpt"

 if __name__ == "__main__":
-  Tensor.no_init = True
  # WTF!! no_grad breaks it (only with OPENCL, now fixed)
  Tensor.no_grad = True
  model = StableDiffusion()
--- a/test/test_mnist.py
+++ b/test/test_mnist.py
@@ -14,8 +14,8 @@ X_train, Y_train, X_test, Y_test = fetch_mnist()
 # create a model
 class TinyBobNet:
  def __init__(self):
-    self.l1 = Tensor.uniform(784, 128)
-    self.l2 = Tensor.uniform(128, 10)
+    self.l1 = Tensor.scaled_uniform(784, 128)
+    self.l2 = Tensor.scaled_uniform(128, 10)

  def parameters(self):
    return get_parameters(self)
@@ -30,9 +30,9 @@ class TinyConvNet:
    conv = 3
    #inter_chan, out_chan = 32, 64
    inter_chan, out_chan = 8, 16   # for speed
-    self.c1 = Tensor.uniform(inter_chan,1,conv,conv)
-    self.c2 = Tensor.uniform(out_chan,inter_chan,conv,conv)
-    self.l1 = Tensor.uniform(out_chan*5*5, 10)
+    self.c1 = Tensor.scaled_uniform(inter_chan,1,conv,conv)
+    self.c2 = Tensor.scaled_uniform(out_chan,inter_chan,conv,conv)
+    self.l1 = Tensor.scaled_uniform(out_chan*5*5, 10)

  def parameters(self):
    return get_parameters(self)
--- a/tinygrad/nn/init.py
+++ b/tinygrad/nn/init.py
@@ -37,21 +37,22 @@ class BatchNorm2D:
      self.batch_invstd = batch_var.add(self.eps)**-0.5
    return batch_normalize(x, self.weight, self.bias, batch_mean, self.batch_invstd)

+# TODO: is this good weight init?
 class Conv2d:
  def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, bias=True):
    self.kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else (kernel_size[0], kernel_size[1])
    self.stride = (stride, stride) if isinstance(stride, int) else (stride[0], stride[1])
    self.padding = (padding, ) * 4 if isinstance(padding, int) else ((padding[0], padding[0], padding[1], padding[1]) if len(padding) == 2 else padding)
-    self.weight = (Tensor.uniform if not Tensor.no_init else Tensor.empty)(out_channels, in_channels, self.kernel_size[0], self.kernel_size[1])
-    self.bias = (Tensor.uniform if not Tensor.no_init else Tensor.empty)(out_channels) if bias else None
+    self.weight = Tensor.glorot_uniform(out_channels, in_channels, self.kernel_size[0], self.kernel_size[1])
+    self.bias = Tensor.zeros(out_channels) if bias else None

  def __call__(self, x):
    return x.conv2d(self.weight, self.bias, padding=self.padding, stride=self.stride)

 class Linear:
  def __init__(self, in_features, out_features, bias=True):
-    self.weight = (Tensor.uniform if not Tensor.no_init else Tensor.empty)(out_features, in_features)
-    self.bias = (Tensor.zeros if not Tensor.no_init else Tensor.empty)(out_features) if bias else None
+    self.weight = Tensor.glorot_uniform(out_features, in_features)
+    self.bias = Tensor.zeros(out_features) if bias else None

  def __call__(self, x):
    return x.linear(self.weight.transpose(), self.bias)
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -11,8 +11,7 @@ from tinygrad.ops import LazyBuffer
 # **** start with two base classes, Tensor and Function ****

 class Tensor:
-  # TODO: remove no_init when uniform is late bind
-  training, no_grad, no_init = False, False, False
+  training, no_grad = False, False

  def __init__(self, data, device=Device.DEFAULT, requires_grad=None):
    if isinstance(data, list):
@@ -106,8 +105,17 @@ class Tensor:
  def arange(cls, stop, start=0, **kwargs): return cls(np.arange(start=start, stop=stop, dtype=np.float32), **kwargs)

  # TODO: uniform should be a late binding thing
+  # Return random numbers drawn uniformly from [-1, 1)
+  # NOTE: the range no longer depends on the shape; scaled_uniform keeps the old prod(shape)**-0.5 scaling
  @classmethod
-  def uniform(cls, *shape, **kwargs): return cls(((np.random.default_rng().random(size=shape, dtype=np.float32) * 2 - 1) * prod(shape)**-0.5), **kwargs)
+  def uniform(cls, *shape, **kwargs): return cls((np.random.default_rng().random(size=shape, dtype=np.float32) * 2 - 1), **kwargs)
+
+  @classmethod
+  def scaled_uniform(cls, *shape, **kwargs): return cls((np.random.default_rng().random(size=shape, dtype=np.float32) * 2 - 1) * (prod(shape)**-0.5), **kwargs)
+
+  @classmethod
+  # https://www.tensorflow.org/api_docs/python/tf/keras/initializers/GlorotUniform
+  def glorot_uniform(cls, *shape, **kwargs): return cls((np.random.default_rng().random(size=shape, dtype=np.float32) * 2 - 1) * ((6/(shape[0]+prod(shape[1:])))**0.5), **kwargs)

  @classmethod
  def eye(cls, dim, **kwargs): return cls(np.eye(dim, dtype=np.float32), **kwargs)
@@ -254,6 +262,7 @@ class Tensor:

  def __neg__(self): return 0.0-self
  def sqrt(self): return self.pow(0.5)
+  def square(self): return self*self
  def clip(self, min_, max_): return ((self-min_).relu()+min_) - (self-max_).relu()
  def abs(self): return self.relu() + (-self).relu()
  def sign(self): return self / (self.abs() + 1e-10)
@@ -312,7 +321,7 @@ class Function:
  def __init__(self, device:str, *tensors:Tensor):
    self.device, self.parents = device, tensors
    self.needs_input_grad = [t.requires_grad for t in self.parents]
-    self.requires_grad = any(self.needs_input_grad)
+    self.requires_grad = True if any(self.needs_input_grad) else (None if any([x is None for x in self.needs_input_grad]) else False)
    self.saved_tensors : List[Tensor] = []

  def forward(self, *args, **kwargs): raise NotImplementedError(f"forward not implemented for {type(self)}")