mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-13 00:15:35 +08:00
Fix weight init: this work? (#391)
* this work? * glorot uniform * requires_grad broke * propagate the None correctly * so this weight init works * ahh, i think it's this * can't beat this * glorot is best for ae * remove comments
This commit is contained in:
@@ -628,7 +628,6 @@ class StableDiffusion:
|
||||
FILENAME = "weights/sd-v1-4.ckpt"
|
||||
|
||||
if __name__ == "__main__":
|
||||
Tensor.no_init = True
|
||||
# WTF!! no_grad breaks it (only with OPENCL, now fixed)
|
||||
Tensor.no_grad = True
|
||||
model = StableDiffusion()
|
||||
|
||||
@@ -14,8 +14,8 @@ X_train, Y_train, X_test, Y_test = fetch_mnist()
|
||||
# create a model
|
||||
class TinyBobNet:
|
||||
def __init__(self):
|
||||
self.l1 = Tensor.uniform(784, 128)
|
||||
self.l2 = Tensor.uniform(128, 10)
|
||||
self.l1 = Tensor.scaled_uniform(784, 128)
|
||||
self.l2 = Tensor.scaled_uniform(128, 10)
|
||||
|
||||
def parameters(self):
|
||||
return get_parameters(self)
|
||||
@@ -30,9 +30,9 @@ class TinyConvNet:
|
||||
conv = 3
|
||||
#inter_chan, out_chan = 32, 64
|
||||
inter_chan, out_chan = 8, 16 # for speed
|
||||
self.c1 = Tensor.uniform(inter_chan,1,conv,conv)
|
||||
self.c2 = Tensor.uniform(out_chan,inter_chan,conv,conv)
|
||||
self.l1 = Tensor.uniform(out_chan*5*5, 10)
|
||||
self.c1 = Tensor.scaled_uniform(inter_chan,1,conv,conv)
|
||||
self.c2 = Tensor.scaled_uniform(out_chan,inter_chan,conv,conv)
|
||||
self.l1 = Tensor.scaled_uniform(out_chan*5*5, 10)
|
||||
|
||||
def parameters(self):
|
||||
return get_parameters(self)
|
||||
|
||||
@@ -37,21 +37,22 @@ class BatchNorm2D:
|
||||
self.batch_invstd = batch_var.add(self.eps)**-0.5
|
||||
return batch_normalize(x, self.weight, self.bias, batch_mean, self.batch_invstd)
|
||||
|
||||
# TODO: is this good weight init?
|
||||
class Conv2d:
|
||||
def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, bias=True):
|
||||
self.kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else (kernel_size[0], kernel_size[1])
|
||||
self.stride = (stride, stride) if isinstance(stride, int) else (stride[0], stride[1])
|
||||
self.padding = (padding, ) * 4 if isinstance(padding, int) else ((padding[0], padding[0], padding[1], padding[1]) if len(padding) == 2 else padding)
|
||||
self.weight = (Tensor.uniform if not Tensor.no_init else Tensor.empty)(out_channels, in_channels, self.kernel_size[0], self.kernel_size[1])
|
||||
self.bias = (Tensor.uniform if not Tensor.no_init else Tensor.empty)(out_channels) if bias else None
|
||||
self.weight = Tensor.glorot_uniform(out_channels, in_channels, self.kernel_size[0], self.kernel_size[1])
|
||||
self.bias = Tensor.zeros(out_channels) if bias else None
|
||||
|
||||
def __call__(self, x):
|
||||
return x.conv2d(self.weight, self.bias, padding=self.padding, stride=self.stride)
|
||||
|
||||
class Linear:
|
||||
def __init__(self, in_features, out_features, bias=True):
|
||||
self.weight = (Tensor.uniform if not Tensor.no_init else Tensor.empty)(out_features, in_features)
|
||||
self.bias = (Tensor.zeros if not Tensor.no_init else Tensor.empty)(out_features) if bias else None
|
||||
self.weight = Tensor.glorot_uniform(out_features, in_features)
|
||||
self.bias = Tensor.zeros(out_features) if bias else None
|
||||
|
||||
def __call__(self, x):
|
||||
return x.linear(self.weight.transpose(), self.bias)
|
||||
|
||||
@@ -11,8 +11,7 @@ from tinygrad.ops import LazyBuffer
|
||||
# **** start with two base classes, Tensor and Function ****
|
||||
|
||||
class Tensor:
|
||||
# TODO: remove no_init when uniform is late bind
|
||||
training, no_grad, no_init = False, False, False
|
||||
training, no_grad = False, False
|
||||
|
||||
def __init__(self, data, device=Device.DEFAULT, requires_grad=None):
|
||||
if isinstance(data, list):
|
||||
@@ -106,8 +105,17 @@ class Tensor:
|
||||
def arange(cls, stop, start=0, **kwargs): return cls(np.arange(start=start, stop=stop, dtype=np.float32), **kwargs)
|
||||
|
||||
# TODO: uniform should be a late binding thing
|
||||
# Return random number between -1 and 1
|
||||
# NOTE: this behavior changed from depending on the shape to not
|
||||
@classmethod
|
||||
def uniform(cls, *shape, **kwargs): return cls(((np.random.default_rng().random(size=shape, dtype=np.float32) * 2 - 1) * prod(shape)**-0.5), **kwargs)
|
||||
def uniform(cls, *shape, **kwargs): return cls((np.random.default_rng().random(size=shape, dtype=np.float32) * 2 - 1), **kwargs)
|
||||
|
||||
@classmethod
|
||||
def scaled_uniform(cls, *shape, **kwargs): return cls((np.random.default_rng().random(size=shape, dtype=np.float32) * 2 - 1) * (prod(shape)**-0.5), **kwargs)
|
||||
|
||||
@classmethod
|
||||
# https://www.tensorflow.org/api_docs/python/tf/keras/initializers/GlorotUniform
|
||||
def glorot_uniform(cls, *shape, **kwargs): return cls((np.random.default_rng().random(size=shape, dtype=np.float32) * 2 - 1) * ((6/(shape[0]+prod(shape[1:])))**0.5), **kwargs)
|
||||
|
||||
@classmethod
|
||||
def eye(cls, dim, **kwargs): return cls(np.eye(dim, dtype=np.float32), **kwargs)
|
||||
@@ -254,6 +262,7 @@ class Tensor:
|
||||
|
||||
def __neg__(self): return 0.0-self
|
||||
def sqrt(self): return self.pow(0.5)
|
||||
def square(self): return self*self
|
||||
def clip(self, min_, max_): return ((self-min_).relu()+min_) - (self-max_).relu()
|
||||
def abs(self): return self.relu() + (-self).relu()
|
||||
def sign(self): return self / (self.abs() + 1e-10)
|
||||
@@ -312,7 +321,7 @@ class Function:
|
||||
def __init__(self, device:str, *tensors:Tensor):
|
||||
self.device, self.parents = device, tensors
|
||||
self.needs_input_grad = [t.requires_grad for t in self.parents]
|
||||
self.requires_grad = any(self.needs_input_grad)
|
||||
self.requires_grad = True if any(self.needs_input_grad) else (None if any([x is None for x in self.needs_input_grad]) else False)
|
||||
self.saved_tensors : List[Tensor] = []
|
||||
|
||||
def forward(self, *args, **kwargs): raise NotImplementedError(f"forward not implemented for {type(self)}")
|
||||
|
||||
Reference in New Issue
Block a user