Fix weight init: this work? (#391)

* this work?

* glorot uniform

* requires_grad broke

* propagate the None correctly

* so this weight init works

* ahh, i think it's this

* can't beat this

* glorot is best for ae

* remove comments
This commit is contained in:
George Hotz
2022-09-25 16:46:33 -04:00
committed by GitHub
parent ff11c4316b
commit 60df954377
4 changed files with 23 additions and 14 deletions

View File

@@ -628,7 +628,6 @@ class StableDiffusion:
FILENAME = "weights/sd-v1-4.ckpt"
if __name__ == "__main__":
Tensor.no_init = True
# WTF!! no_grad breaks it (only with OPENCL, now fixed)
Tensor.no_grad = True
model = StableDiffusion()

View File

@@ -14,8 +14,8 @@ X_train, Y_train, X_test, Y_test = fetch_mnist()
# create a model
class TinyBobNet:
def __init__(self):
self.l1 = Tensor.uniform(784, 128)
self.l2 = Tensor.uniform(128, 10)
self.l1 = Tensor.scaled_uniform(784, 128)
self.l2 = Tensor.scaled_uniform(128, 10)
def parameters(self):
return get_parameters(self)
@@ -30,9 +30,9 @@ class TinyConvNet:
conv = 3
#inter_chan, out_chan = 32, 64
inter_chan, out_chan = 8, 16 # for speed
self.c1 = Tensor.uniform(inter_chan,1,conv,conv)
self.c2 = Tensor.uniform(out_chan,inter_chan,conv,conv)
self.l1 = Tensor.uniform(out_chan*5*5, 10)
self.c1 = Tensor.scaled_uniform(inter_chan,1,conv,conv)
self.c2 = Tensor.scaled_uniform(out_chan,inter_chan,conv,conv)
self.l1 = Tensor.scaled_uniform(out_chan*5*5, 10)
def parameters(self):
return get_parameters(self)

View File

@@ -37,21 +37,22 @@ class BatchNorm2D:
self.batch_invstd = batch_var.add(self.eps)**-0.5
return batch_normalize(x, self.weight, self.bias, batch_mean, self.batch_invstd)
# TODO: is this good weight init?
# 2D convolution layer: stores a weight tensor (and optional bias) and applies x.conv2d when called.
class Conv2d:
def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, bias=True):
# An int kernel_size / stride is expanded to a (value, value) pair; otherwise the first two entries are used.
self.kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else (kernel_size[0], kernel_size[1])
self.stride = (stride, stride) if isinstance(stride, int) else (stride[0], stride[1])
# padding: an int is repeated four times, a length-2 sequence (a, b) becomes (a, a, b, b), anything else is kept as given.
self.padding = (padding, ) * 4 if isinstance(padding, int) else ((padding[0], padding[0], padding[1], padding[1]) if len(padding) == 2 else padding)
# NOTE(review): the next two lines appear to be the pre-change side of this diff (uniform init, or Tensor.empty when Tensor.no_init is set).
self.weight = (Tensor.uniform if not Tensor.no_init else Tensor.empty)(out_channels, in_channels, self.kernel_size[0], self.kernel_size[1])
self.bias = (Tensor.uniform if not Tensor.no_init else Tensor.empty)(out_channels) if bias else None
# Replacement init: Glorot-uniform weight of shape (out_channels, in_channels, kh, kw); bias is zeros, or None when bias is falsy.
self.weight = Tensor.glorot_uniform(out_channels, in_channels, self.kernel_size[0], self.kernel_size[1])
self.bias = Tensor.zeros(out_channels) if bias else None
def __call__(self, x):
# Delegates to x.conv2d with the stored weight, bias, padding and stride.
return x.conv2d(self.weight, self.bias, padding=self.padding, stride=self.stride)
# Fully connected layer: stores a weight of shape (out_features, in_features) and an optional bias.
class Linear:
def __init__(self, in_features, out_features, bias=True):
# NOTE(review): the next two lines appear to be the pre-change side of this diff (uniform/zeros init, or Tensor.empty when Tensor.no_init is set).
self.weight = (Tensor.uniform if not Tensor.no_init else Tensor.empty)(out_features, in_features)
self.bias = (Tensor.zeros if not Tensor.no_init else Tensor.empty)(out_features) if bias else None
# Replacement init: Glorot-uniform weight; bias is zeros, or None when bias is falsy.
self.weight = Tensor.glorot_uniform(out_features, in_features)
self.bias = Tensor.zeros(out_features) if bias else None
def __call__(self, x):
# The weight is stored as (out_features, in_features), so it is transposed before being passed to x.linear.
return x.linear(self.weight.transpose(), self.bias)

View File

@@ -11,8 +11,7 @@ from tinygrad.ops import LazyBuffer
# **** start with two base classes, Tensor and Function ****
class Tensor:
# TODO: remove no_init when uniform is late bind
training, no_grad, no_init = False, False, False
training, no_grad = False, False
def __init__(self, data, device=Device.DEFAULT, requires_grad=None):
if isinstance(data, list):
@@ -106,8 +105,17 @@ class Tensor:
def arange(cls, stop, start=0, **kwargs): return cls(np.arange(start=start, stop=stop, dtype=np.float32), **kwargs)
# TODO: uniform should be a late binding thing
# Return a tensor of the given shape with float32 values drawn uniformly from [-1, 1).
# NOTE: the output scale no longer depends on the shape. The first definition below multiplied by
# prod(shape)**-0.5; the second one does not (scaled_uniform keeps the shape-dependent scaling).
@classmethod
def uniform(cls, *shape, **kwargs): return cls(((np.random.default_rng().random(size=shape, dtype=np.float32) * 2 - 1) * prod(shape)**-0.5), **kwargs)
def uniform(cls, *shape, **kwargs): return cls((np.random.default_rng().random(size=shape, dtype=np.float32) * 2 - 1), **kwargs)
# Uniform float32 values in [-1, 1) multiplied by prod(shape)**-0.5, i.e. divided by the square root of the total element count.
# Extra kwargs are forwarded to the constructor.
@classmethod
def scaled_uniform(cls, *shape, **kwargs): return cls((np.random.default_rng().random(size=shape, dtype=np.float32) * 2 - 1) * (prod(shape)**-0.5), **kwargs)
@classmethod
# https://www.tensorflow.org/api_docs/python/tf/keras/initializers/GlorotUniform
# Uniform float32 values in [-1, 1) multiplied by sqrt(6 / (shape[0] + prod(shape[1:]))).
# NOTE(review): for a 2-D (out, in) weight this equals the linked limit sqrt(6 / (fan_in + fan_out)).
# For 4-D conv weights, shape[0] leaves out the kernel area that the linked initializer counts in fan_out;
# presumably intentional, confirm before relying on exact Keras parity.
def glorot_uniform(cls, *shape, **kwargs): return cls((np.random.default_rng().random(size=shape, dtype=np.float32) * 2 - 1) * ((6/(shape[0]+prod(shape[1:])))**0.5), **kwargs)
# Return a dim x dim float32 identity-matrix tensor; extra kwargs are forwarded to the constructor.
@classmethod
def eye(cls, dim, **kwargs): return cls(np.eye(dim, dtype=np.float32), **kwargs)
@@ -254,6 +262,7 @@ class Tensor:
# Negation expressed as subtraction from 0.0.
def __neg__(self): return 0.0-self
# Square root expressed as a power of 0.5.
def sqrt(self): return self.pow(0.5)
def square(self): return self*self
# (self-min_).relu()+min_ raises values below min_ up to min_; subtracting (self-max_).relu() removes the excess above max_.
def clip(self, min_, max_): return ((self-min_).relu()+min_) - (self-max_).relu()
# relu(x) + relu(-x): exactly one of the two terms is non-zero for any non-zero x.
def abs(self): return self.relu() + (-self).relu()
# The 1e-10 in the denominator avoids division by zero, so a zero input yields 0.
def sign(self): return self / (self.abs() + 1e-10)
@@ -312,7 +321,7 @@ class Function:
# Records the device and the parent tensors, and derives whether this node requires a gradient.
def __init__(self, device:str, *tensors:Tensor):
self.device, self.parents = device, tensors
# One requires_grad flag per parent tensor; the second assignment below treats None as a distinct value.
self.needs_input_grad = [t.requires_grad for t in self.parents]
# First definition: plain any(), which gives False when the flags are only False and/or None.
self.requires_grad = any(self.needs_input_grad)
# Second definition (overrides the line above): True if any parent requires grad,
# otherwise None if any parent's flag is None, otherwise False.
self.requires_grad = True if any(self.needs_input_grad) else (None if any([x is None for x in self.needs_input_grad]) else False)
# Starts empty; this block does not show what fills it.
self.saved_tensors : List[Tensor] = []
# Base implementation always raises NotImplementedError, naming the concrete type in the message.
def forward(self, *args, **kwargs): raise NotImplementedError(f"forward not implemented for {type(self)}")