diff --git a/examples/stable_diffusion.py b/examples/stable_diffusion.py index fb62758bff..37eba6b463 100644 --- a/examples/stable_diffusion.py +++ b/examples/stable_diffusion.py @@ -628,7 +628,6 @@ class StableDiffusion: FILENAME = "weights/sd-v1-4.ckpt" if __name__ == "__main__": - Tensor.no_init = True # WTF!! no_grad breaks it (only with OPENCL, now fixed) Tensor.no_grad = True model = StableDiffusion() diff --git a/test/test_mnist.py b/test/test_mnist.py index 81bfcc402c..51a0da601a 100644 --- a/test/test_mnist.py +++ b/test/test_mnist.py @@ -14,8 +14,8 @@ X_train, Y_train, X_test, Y_test = fetch_mnist() # create a model class TinyBobNet: def __init__(self): - self.l1 = Tensor.uniform(784, 128) - self.l2 = Tensor.uniform(128, 10) + self.l1 = Tensor.scaled_uniform(784, 128) + self.l2 = Tensor.scaled_uniform(128, 10) def parameters(self): return get_parameters(self) @@ -30,9 +30,9 @@ class TinyConvNet: conv = 3 #inter_chan, out_chan = 32, 64 inter_chan, out_chan = 8, 16 # for speed - self.c1 = Tensor.uniform(inter_chan,1,conv,conv) - self.c2 = Tensor.uniform(out_chan,inter_chan,conv,conv) - self.l1 = Tensor.uniform(out_chan*5*5, 10) + self.c1 = Tensor.scaled_uniform(inter_chan,1,conv,conv) + self.c2 = Tensor.scaled_uniform(out_chan,inter_chan,conv,conv) + self.l1 = Tensor.scaled_uniform(out_chan*5*5, 10) def parameters(self): return get_parameters(self) diff --git a/tinygrad/nn/__init__.py b/tinygrad/nn/__init__.py index 43c5cce8b6..86651fba62 100644 --- a/tinygrad/nn/__init__.py +++ b/tinygrad/nn/__init__.py @@ -37,21 +37,22 @@ class BatchNorm2D: self.batch_invstd = batch_var.add(self.eps)**-0.5 return batch_normalize(x, self.weight, self.bias, batch_mean, self.batch_invstd) +# TODO: is this good weight init? class Conv2d: def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, bias=True): self.kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else (kernel_size[0], kernel_size[1]) self.stride = (stride, stride) if isinstance(stride, int) else (stride[0], stride[1]) self.padding = (padding, ) * 4 if isinstance(padding, int) else ((padding[0], padding[0], padding[1], padding[1]) if len(padding) == 2 else padding) - self.weight = (Tensor.uniform if not Tensor.no_init else Tensor.empty)(out_channels, in_channels, self.kernel_size[0], self.kernel_size[1]) - self.bias = (Tensor.uniform if not Tensor.no_init else Tensor.empty)(out_channels) if bias else None + self.weight = Tensor.glorot_uniform(out_channels, in_channels, self.kernel_size[0], self.kernel_size[1]) + self.bias = Tensor.zeros(out_channels) if bias else None def __call__(self, x): return x.conv2d(self.weight, self.bias, padding=self.padding, stride=self.stride) class Linear: def __init__(self, in_features, out_features, bias=True): - self.weight = (Tensor.uniform if not Tensor.no_init else Tensor.empty)(out_features, in_features) - self.bias = (Tensor.zeros if not Tensor.no_init else Tensor.empty)(out_features) if bias else None + self.weight = Tensor.glorot_uniform(out_features, in_features) + self.bias = Tensor.zeros(out_features) if bias else None def __call__(self, x): return x.linear(self.weight.transpose(), self.bias) diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py index cd98b30a96..1fa06ba51a 100644 --- a/tinygrad/tensor.py +++ b/tinygrad/tensor.py @@ -11,8 +11,7 @@ from tinygrad.ops import LazyBuffer # **** start with two base classes, Tensor and Function **** class Tensor: - # TODO: remove no_init when uniform is late bind - training, no_grad, no_init = False, False, False + training, no_grad = False, False def __init__(self, data, device=Device.DEFAULT, requires_grad=None): if isinstance(data, list): @@ -106,8 +105,17 @@ class Tensor: def arange(cls, stop, start=0, **kwargs): return cls(np.arange(start=start, stop=stop, dtype=np.float32), **kwargs) # TODO: uniform should be a late binding thing + # Return random number between -1 and 1 + # NOTE: this behavior changed from depending on the shape to not @classmethod - def uniform(cls, *shape, **kwargs): return cls(((np.random.default_rng().random(size=shape, dtype=np.float32) * 2 - 1) * prod(shape)**-0.5), **kwargs) + def uniform(cls, *shape, **kwargs): return cls((np.random.default_rng().random(size=shape, dtype=np.float32) * 2 - 1), **kwargs) + + @classmethod + def scaled_uniform(cls, *shape, **kwargs): return cls((np.random.default_rng().random(size=shape, dtype=np.float32) * 2 - 1) * (prod(shape)**-0.5), **kwargs) + + @classmethod + # https://www.tensorflow.org/api_docs/python/tf/keras/initializers/GlorotUniform + def glorot_uniform(cls, *shape, **kwargs): return cls((np.random.default_rng().random(size=shape, dtype=np.float32) * 2 - 1) * ((6/(shape[0]+prod(shape[1:])))**0.5), **kwargs) @classmethod def eye(cls, dim, **kwargs): return cls(np.eye(dim, dtype=np.float32), **kwargs) @@ -254,6 +262,7 @@ class Tensor: def __neg__(self): return 0.0-self def sqrt(self): return self.pow(0.5) + def square(self): return self*self def clip(self, min_, max_): return ((self-min_).relu()+min_) - (self-max_).relu() def abs(self): return self.relu() + (-self).relu() def sign(self): return self / (self.abs() + 1e-10) @@ -312,7 +321,7 @@ class Function: def __init__(self, device:str, *tensors:Tensor): self.device, self.parents = device, tensors self.needs_input_grad = [t.requires_grad for t in self.parents] - self.requires_grad = any(self.needs_input_grad) + self.requires_grad = True if any(self.needs_input_grad) else (None if any([x is None for x in self.needs_input_grad]) else False) self.saved_tensors : List[Tensor] = [] def forward(self, *args, **kwargs): raise NotImplementedError(f"forward not implemented for {type(self)}")