diff --git a/.gitignore b/.gitignore
index fec6731e07..4394726d7d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@ __pycache__
 notebooks
 .*.swp
 .*.swo
+*.pyc
 build
 dist
 *.egg-info
diff --git a/docs/yolo_by_tinygrad.jpg b/docs/yolo_by_tinygrad.jpg
new file mode 100644
index 0000000000..85785ddba5
Binary files /dev/null and b/docs/yolo_by_tinygrad.jpg differ
diff --git a/examples/yolo_nn.py b/examples/yolo_nn.py
new file mode 100644
index 0000000000..f2f64b281a
--- /dev/null
+++ b/examples/yolo_nn.py
@@ -0,0 +1,92 @@
+from tinygrad.tensor import Tensor
+
+# PyTorch-style layers for tinygrad. These layers live here because of tinygrad's
+# line limit.
+
+class MaxPool2d:
+  """PyTorch-style max-pool layer; `stride` is stored but not applied yet (see TODO below)."""
+  def __init__(self, kernel_size, stride):
+    # An int kernel size means a square window.
+    self.kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size
+    self.stride = stride if (stride is not None) else kernel_size
+
+  def __repr__(self):
+    return f"MaxPool2d(kernel_size={self.kernel_size!r}, stride={self.stride!r})"
+
+  def __call__(self, input):
+    # TODO: Implement strided max_pool2d, and maxpool2d for 3d inputs
+    return input.max_pool2d(kernel_size=self.kernel_size)
+
+
+class DetectionLayer:  # holds the anchors selected for one "yolo" cfg block
+  def __init__(self, anchors):
+    self.anchors = anchors
+  # Calling the layer is an identity pass-through: the input is returned unchanged.
+  def __call__(self, input):
+    return input
+
+class EmptyLayer:  # placeholder module used for "route" and "shortcut" cfg blocks
+  def __init__(self):
+    pass
+  # Identity: returns its input unchanged.
+  def __call__(self, input):
+    return input
+
+class Upsample:  # repeats elements along the last two axes (nearest-neighbour style)
+  def __init__(self, scale_factor = 2, mode = "nearest"):
+    self.scale_factor, self.mode = scale_factor, mode  # mode is only reported by __repr__
+  # Repeats every element scale_factor times along the last two axes of input's CPU data.
+  def upsampleNearest(self, input):
+    # TODO: Implement actual interpolation function
+    # inspired: https://github.com/pytorch/pytorch/blob/master/torch/csrc/api/include/torch/nn/functional/upsampling.h
+    return input.cpu().data.repeat(self.scale_factor, axis=len(input.shape)-2).repeat(self.scale_factor, axis=len(input.shape)-1)
+
+  def __repr__(self):
+    return f"Upsample(scale_factor={self.scale_factor!r}, mode={self.mode!r})"
+
+  def __call__(self, input):
+    return Tensor(self.upsampleNearest(input))  # wrap the repeated data in a new Tensor
+
+class LeakyReLU:  # thin wrapper around input.leakyrelu
+  def __init__(self, neg_slope):
+    self.neg_slope = neg_slope
+  # neg_slope is shown by __repr__ and forwarded unchanged to input.leakyrelu.
+  def __repr__(self):
+    return f"LeakyReLU({self.neg_slope!r})"
+
+  def __call__(self, input):
+    return input.leakyrelu(self.neg_slope)
+
+
+class Conv2d:
+  """PyTorch-style 2D convolution layer built on the tensor's pad2d / conv2d / add methods.
+
+  kernel_size may be an int (square kernel) or a pair. padding is applied equally on all
+  four sides. When bias is true a (1, out_channels, 1, 1) bias tensor is created and added
+  after the convolution; otherwise self.bias is None.
+  """
+
+  def __init__(self, in_channels, out_channels, kernel_size, stride = 1, padding = 0, groups = 1, bias = True):
+    self.in_channels, self.out_channels = in_channels, out_channels
+    self.stride, self.padding, self.groups = stride, padding, groups
+    self.kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size
+
+    # Grouped convolution needs both channel counts to be divisible by groups.
+    assert out_channels % groups == 0 and in_channels % groups == 0
+
+    # Weight layout: (out_channels, in_channels // groups, *kernel_size).
+    self.weight = Tensor.uniform(out_channels, in_channels // groups, *self.kernel_size)
+    self.bias = Tensor.uniform(1, out_channels, 1, 1) if bias else None
+
+  def __repr__(self):
+    # The fields are comma separated and the closing parenthesis is emitted.
+    return f"Conv2d({self.in_channels!r}, {self.out_channels!r}, kernel_size={self.kernel_size!r}, stride={self.stride!r})"
+
+  def __call__(self, x):
+    # Pad only when requested, convolve, then add the bias if the layer has one.
+    if self.padding != 0:
+      x = x.pad2d(padding=[self.padding] * 4)
+    x = x.conv2d(self.weight, stride=self.stride, groups=self.groups)
+    if self.bias is not None:
+      x = x.add(self.bias)
+    return x
diff --git a/examples/yolov3.py b/examples/yolov3.py
new file mode 100644
index 0000000000..eabe527a87
--- /dev/null
+++ b/examples/yolov3.py
@@ -0,0 +1,623 @@
+# https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3.cfg
+# running 
+
+import os
+GPU = os.getenv("GPU", None) is not None
+import sys
+import io
+import time
+import numpy as np
+np.set_printoptions(suppress=True)
+from tinygrad.tensor import Tensor
+from extra.utils import fetch, get_parameters
+from yolo_nn import Conv2d, Upsample, EmptyLayer, DetectionLayer, LeakyReLU, MaxPool2d
+from tinygrad.nn import BatchNorm2D
+
+import cv2
+from PIL import Image
+
+def show_labels(prediction, confidence = 0.5, num_classes = 80):
+  """Print the COCO label and objectness (as a percentage) of each detected class.
+
+  prediction is indexed as [batch, box, attribute]; attribute 4 is the objectness
+  score and attributes 5..5+num_classes are the per-class scores. Boxes whose
+  objectness is not above `confidence` are zeroed out and ignored.
+  """
+  coco_labels = fetch('https://raw.githubusercontent.com/pjreddie/darknet/master/data/coco.names')
+  coco_labels = coco_labels.decode('utf-8').split('\n')
+
+  prediction = prediction.detach().cpu().data
+
+  # Zero every row whose objectness is at or below the threshold.
+  conf_mask = np.expand_dims(prediction[:,:,4] > confidence, 2)
+  prediction = prediction * conf_mask
+
+  # Iterate over batches
+  for i in range(prediction.shape[0]):
+    img_pred = prediction[i]
+    class_scores = img_pred[:,5:5 + num_classes]
+    max_conf = np.expand_dims(np.amax(class_scores, axis=1), axis=1)
+    max_conf_score = np.expand_dims(np.argmax(class_scores, axis=1), axis=1)
+    image_pred = np.concatenate((img_pred[:,:5], max_conf, max_conf_score), axis=1)
+
+    # Keep only the rows that survived the confidence mask.
+    image_pred_ = image_pred[image_pred[:,4] != 0].reshape((-1, 7))
+    if image_pred_.shape[0] == 0:
+      print("No detections found!")
+      continue
+    # One line per distinct class, taken from the first row of that class.
+    classes, indexes = np.unique(image_pred_[:, -1], return_index=True)
+    for index, coco_class in enumerate(classes):
+      probability = image_pred_[indexes[index]][4] * 100
+      print("Detected", coco_labels[int(coco_class)], "{:.2f}%".format(probability))
+
+def letterbox_image(img, inp_dim=608):
+  """Resize img to fit an inp_dim canvas, keeping aspect ratio, centred on grey (128) padding.
+  inp_dim is either one int (square canvas) or a (width, height) pair."""
+  img_w, img_h = img.shape[1], img.shape[0]
+  w, h = (inp_dim, inp_dim) if isinstance(inp_dim, int) else inp_dim  # the int default used to fail to unpack
+  scale = min(w/img_w, h/img_h)
+  new_w, new_h = int(img_w * scale), int(img_h * scale)
+  resized_image = cv2.resize(img, (new_w,new_h), interpolation = cv2.INTER_CUBIC)
+  canvas = np.full((h, w, 3), 128)
+  canvas[(h-new_h)//2:(h-new_h)//2 + new_h,(w-new_w)//2:(w-new_w)//2 + new_w,  :] = resized_image
+  return canvas
+
+def add_boxes(img, prediction):
+  """Draw a labelled rectangle on img for every row of prediction (modified in place) and return img."""
+  if type(prediction) is int: # no predictions
+    return img
+  coco_labels = fetch('https://raw.githubusercontent.com/pjreddie/darknet/master/data/coco.names')
+  coco_labels = coco_labels.decode('utf-8').split('\n')
+  height, width = img.shape[0:2]
+  scale_factor = 608 / width
+  # NOTE(review): the x offset below is always 0 because scale_factor * width == 608.
+  prediction[:,[1,3]] -= (608 - scale_factor * width) / 2
+  prediction[:,[2,4]] -= (608 - scale_factor * height) / 2
+  for i in range(prediction.shape[0]):
+    pred = prediction[i]
+    corner1 = tuple(pred[1:3].astype(int))
+    corner2 = tuple(pred[3:5].astype(int))
+    w = corner2[0] - corner1[0]
+    h = corner2[1] - corner1[1]
+    corner2 = (corner2[0] + w, corner2[1] + h)  # NOTE(review): extends the far corner by the box's own extent - confirm against the layout process_results produces
+    label = coco_labels[int(pred[-1])]
+    img = cv2.rectangle(img, corner1, corner2, (255, 0, 0), 2)
+    t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1 , 1)[0]
+    c2 = corner1[0] + t_size[0] + 3, corner1[1] + t_size[1] + 4
+    img = cv2.rectangle(img, corner1, c2, (255, 0, 0), -1)
+    img = cv2.putText(img, label, (corner1[0], corner1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225,255,255], 1)
+
+  return img
+
+def bbox_iou(box1, box2):
+  """
+  Returns the IoU of two sets of bounding boxes given as rows of (x1, y1, x2, y2, ...).
+  IoU = Area of Overlap / Area of Union -> how close the predicted bounding box is
+  to the ground truth bounding box. Higher IoU = better accuracy.
+
+  In training it tracks accuracy; at inference it is used to remove duplicate bounding boxes.
+  """
+  # Get the coordinates of bounding boxes
+  b1_x1, b1_y1, b1_x2, b1_y2 = box1[:,0], box1[:,1], box1[:,2], box1[:,3]
+  b2_x1, b2_y1, b2_x2, b2_y2 = box2[:,0], box2[:,1], box2[:,2], box2[:,3]
+
+  # Get the coordinates of the intersection rectangle: the larger of the two
+  # (x1, y1) corners and the smaller of the two (x2, y2) corners.
+  inter_rect_x1 = np.maximum(b1_x1, b2_x1)
+  inter_rect_y1 = np.maximum(b1_y1, b2_y1)
+  inter_rect_x2 = np.minimum(b1_x2, b2_x2)
+  inter_rect_y2 = np.minimum(b1_y2, b2_y2)
+
+  # Intersection area (each side is clipped at 0 so that disjoint boxes contribute nothing)
+  inter_area = np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None) * np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None)
+
+  # Union area
+  b1_area = (b1_x2 - b1_x1 + 1)*(b1_y2 - b1_y1 + 1)
+  b2_area = (b2_x2 - b2_x1 + 1)*(b2_y2 - b2_y1 + 1)
+
+  iou = inter_area / (b1_area + b2_area - inter_area)
+  return iou
+
+
+def process_results(prediction, confidence = 0.9, num_classes = 80, nms_conf = 0.4):
+  """Filter the network output for the first image by confidence and run per-class NMS.
+
+  Args:
+    prediction: network output offering .detach().cpu().data, indexed as [batch, box, attribute];
+      attribute 4 is the objectness score and attributes 5..5+num_classes are the class scores.
+    confidence: boxes whose objectness is not above this value are discarded.
+    num_classes: number of class-score columns that follow the objectness column.
+    nms_conf: IoU at or above which the lower ranked box of a pair is suppressed.
+
+  Returns:
+    An array with one row per kept detection: batch index (always 0), four box values,
+    objectness, best class score and best class index; or the int 0 when no box has
+    an objectness above `confidence`.
+  """
+  prediction = prediction.detach().cpu().data
+  conf_mask = (prediction[:,:,4] > confidence)
+  conf_mask = np.expand_dims(conf_mask, 2)
+  prediction = prediction * conf_mask
+
+  # NOTE(review): box_corner is the very same array as prediction (no copy is made), so
+  # the assignments to columns 2 and 3 read the already shifted columns 0 and 1.
+  # add_boxes is written against the resulting layout, so it is left untouched here.
+  box_corner = prediction
+  box_corner[:,:,0] = (prediction[:,:,0] - prediction[:,:,2]/2)
+  box_corner[:,:,1] = (prediction[:,:,1] - prediction[:,:,3]/2)
+  box_corner[:,:,2] = (prediction[:,:,0] + prediction[:,:,2]/2)
+  box_corner[:,:,3] = (prediction[:,:,1] + prediction[:,:,3]/2)
+  prediction[:,:,:4] = box_corner[:,:,:4]
+
+  # Process img: only the first image of the batch is looked at
+  img_pred = prediction[0]
+
+  def numpy_max(input, dim):
+    # Returns (maximum values, indices of the maxima) along dim
+    return np.amax(input, axis=dim), np.argmax(input, axis=dim)
+
+  # Best class score and the index of that class for every box
+  max_conf, max_conf_score = numpy_max(img_pred[:,5:5 + num_classes], 1)
+  max_conf_score = np.expand_dims(max_conf_score, axis=1)
+  max_conf = np.expand_dims(max_conf, axis=1)
+  # Row layout from here on: four box values, objectness, best class score, best class index
+  seq = (img_pred[:,:5], max_conf, max_conf_score)
+  image_pred = np.concatenate(seq, axis=1)
+
+  # Keep only the rows that survived the confidence mask (non-zero objectness)
+  non_zero_ind = np.nonzero(image_pred[:,4])[0]
+  image_pred_ = np.reshape(image_pred[non_zero_ind,:], (-1, 7))
+
+  if image_pred_.shape[0] == 0:
+    print("No detections found!")
+    return 0
+
+  img_classes = np.unique(image_pred_[:, -1])
+
+  # One array of surviving detections per class, concatenated at the end
+  kept = []
+
+  for cls in img_classes:
+    # perform NMS, get the detections with one particular class
+    # (rows of this class that have a non-zero class score)
+    cls_mask = image_pred_*np.expand_dims(image_pred_[:, -1] == cls, axis=1)
+    class_mask_ind = np.nonzero(cls_mask[:,-2])[0]
+    image_pred_class = np.reshape(image_pred_[class_mask_ind], (-1, 7))
+
+    # sort the detections such that the entry with the maximum objectness
+    # confidence is at the top; np.argsort is ascending, hence the reversal
+    conf_sort_index = np.argsort(image_pred_class[:,4])[::-1]
+    image_pred_class = image_pred_class[conf_sort_index]
+    idx = image_pred_class.shape[0]   #Number of detections
+
+    for i in range(idx):
+      # image_pred_class shrinks as boxes are suppressed, so stop once i
+      # has run past the last remaining detection
+      if i >= image_pred_class.shape[0]:
+        break
+
+      #Get the IOUs of all boxes that come after the one we are looking at
+      #in the loop
+      ious = bbox_iou(np.expand_dims(image_pred_class[i], axis=0), image_pred_class[i+1:])
+
+      # Zero out all the detections that have IoU >= threshold
+      iou_mask = np.expand_dims((ious < nms_conf), axis=1)
+      image_pred_class[i+1:] *= iou_mask
+
+      # Keep only the non-zero entries
+      non_zero_ind = np.nonzero(image_pred_class[:,4])[0]
+      image_pred_class = np.reshape(image_pred_class[non_zero_ind], (-1, 7))
+
+    # One batch-index entry per surviving row; a single [[0]] cannot be concatenated
+    # with more than one detection
+    batch_ind = np.zeros((image_pred_class.shape[0], 1))
+    kept.append(np.concatenate((batch_ind, image_pred_class), axis=1))
+
+  return np.concatenate(kept)
+
+def imresize(img, w, h):  # resize an array image to (w, h) through PIL and return it as an array
+  return np.array(Image.fromarray(img).resize((w, h)))
+
+def resize(img, inp_dim=(608, 608)):
+  """Scale img to fit inside inp_dim (width, height) keeping its aspect ratio, centred on a
+  canvas filled with 128."""
+  img_h, img_w = img.shape[0], img.shape[1]
+  w, h = inp_dim
+  ratio = min(w/img_w, h/img_h)
+  new_w, new_h = int(img_w * ratio), int(img_h * ratio)
+  top, left = (h-new_h)//2, (w-new_w)//2
+  scaled = cv2.resize(img, (new_w,new_h), interpolation = cv2.INTER_CUBIC)
+  canvas = np.full((inp_dim[1], inp_dim[0], 3), 128); canvas[top:top + new_h, left:left + new_w, :] = scaled
+  return canvas
+
+def infer(model, img):
+  """Resize img to 608x608, turn it into a batched channels-first array divided by 255 and run model.forward."""
+  img = np.array(img)
+  img = imresize(img, 608, 608)
+  # img = resize(img)
+  img = img[:,:,::-1].transpose((2,0,1))  # reverse the channel axis, then move it to the front
+  img = img[np.newaxis,:,:,:]/255.0  # add the leading batch axis and divide by 255
+  prediction = model.forward(Tensor(img))
+  return prediction
+
+
+def parse_cfg(cfg):
+  """Parse the bytes of a darknet .cfg file into a list of blocks.
+
+  Each "[name]" header starts a new dict whose "type" entry is the name; every following
+  "key=value" line adds a string entry to it. Blank lines and '#' comments are skipped.
+  """
+  # Strip first so that whitespace-only lines and indented comments are filtered as well.
+  lines = [x.strip() for x in cfg.decode("utf-8").split('\n')]
+  lines = [x for x in lines if x and not x.startswith('#')]
+  block = {}
+  blocks = []
+  for line in lines:
+    if line[0] == "[":
+      if len(block) != 0:
+        blocks.append(block)
+        block = {}
+      block["type"] = line[1:-1].rstrip()
+    else:
+      key,value = line.split("=", 1)  # only the first '=' separates key from value
+      block[key.rstrip()] = value.lstrip()
+  blocks.append(block)
+  return blocks
+
+# TODO: Speed up this function, avoid copying stuff from GPU to CPU
+def predict_transform(prediction, inp_dim, anchors, num_classes):
+  """Decode one raw detection map into box rows; the arithmetic runs on a CPU copy of the data."""
+  batch_size = prediction.shape[0]
+  stride = inp_dim // prediction.shape[2]
+  grid_size = inp_dim // stride
+  bbox_attrs = 5 + num_classes
+  num_anchors = len(anchors)
+  prediction = prediction.reshape(shape=(batch_size, bbox_attrs*num_anchors, grid_size*grid_size))
+  # Original PyTorch: transpose(1, 2) -> For some reason numpy.transpose order has to be reversed?
+  prediction = prediction.transpose(order=(0, 2, 1))
+  prediction = prediction.reshape(shape=(batch_size, grid_size*grid_size*num_anchors, bbox_attrs))
+
+  # st = time.time()
+  prediction_cpu = prediction.cpu().data
+  # print('put on CPU in %.2f s' % (time.time() - st))
+
+  anchors = [(a[0]/stride, a[1]/stride) for a in anchors]  # anchors divided by the stride
+  #Sigmoid the  centre_X, centre_Y. and object confidence
+  # TODO: Fix this
+  def dsigmoid(data):
+    return 1/(1+np.exp(-data))
+
+  prediction_cpu[:,:,0] = dsigmoid(prediction_cpu[:,:,0])
+  prediction_cpu[:,:,1] = dsigmoid(prediction_cpu[:,:,1])
+  prediction_cpu[:,:,4] = dsigmoid(prediction_cpu[:,:,4])
+
+  # Add the center offsets
+  grid = np.arange(grid_size)
+  a, b = np.meshgrid(grid, grid)
+
+  x_offset = a.reshape((-1, 1))
+  y_offset = b.reshape((-1, 1))
+
+  x_y_offset = np.concatenate((x_offset, y_offset), 1)
+  x_y_offset = np.tile(x_y_offset, (1, num_anchors))
+  x_y_offset = x_y_offset.reshape((-1,2))
+  x_y_offset = np.expand_dims(x_y_offset, 0)
+
+  prediction_cpu[:,:,:2] += x_y_offset
+
+  anchors = np.tile(anchors, (grid_size*grid_size, 1))
+  anchors = np.expand_dims(anchors, 0)
+
+  prediction_cpu[:,:,2:4] = np.exp(prediction_cpu[:,:,2:4])*anchors  # columns 2-3: exp of the raw value times the anchor
+  prediction_cpu[:,:,5: 5 + num_classes] = dsigmoid((prediction_cpu[:,:, 5 : 5 + num_classes]))
+  prediction_cpu[:,:,:4] *= stride  # scale the four box columns by the stride
+  prediction.gpu_()  # NOTE(review): called unconditionally, even when the GPU env var is unset - confirm intent
+
+  return Tensor(prediction_cpu)
+
+
+class Darknet:
+  """Network assembled from a darknet cfg; forward() returns the detections of all yolo blocks."""
+  def __init__(self, cfg):
+    self.blocks = parse_cfg(cfg)
+    self.net_info, self.module_list = self.create_modules(self.blocks)
+    print("Modules length:", len(self.module_list))
+
+  def create_modules(self, blocks):
+    net_info = blocks[0] # Info about model hyperparameters
+    prev_filters = 3
+    filters = None
+    output_filters = []
+    module_list = []
+    ## module
+    for index, x in enumerate(blocks[1:]):
+      module_type = x["type"]
+      module = []
+      if module_type == "convolutional":
+        try:
+          batch_normalize = int(x["batch_normalize"])
+          bias = False
+        except KeyError:  # block has no batch_normalize entry: plain conv with bias
+          batch_normalize = 0
+          bias = True
+
+        # layer
+        activation = x["activation"]
+        filters = int(x["filters"])
+        padding = int(x["pad"])
+        if padding:
+          pad = (int(x["size"]) - 1) // 2
+        else:
+          pad = 0
+
+        conv = Conv2d(prev_filters, filters, int(x["size"]), int(x["stride"]), pad, bias = bias)
+        module.append(conv)
+
+        # BatchNorm2d
+        if batch_normalize:
+          bn = BatchNorm2D(filters, eps=1e-05, training=True, track_running_stats=True)
+          module.append(bn)
+
+        # LeakyReLU activation
+        if activation == "leaky":
+          module.append(LeakyReLU(0.1))
+
+      # TODO: Add tiny model
+      elif module_type == "maxpool":
+        size = int(x["size"])
+        stride = int(x["stride"])
+        maxpool = MaxPool2d(size, stride)
+        module.append(maxpool)
+
+      elif module_type == "upsample":
+        upsample = Upsample(scale_factor = 2, mode = "nearest")
+        module.append(upsample)
+
+      elif module_type == "route":
+        x["layers"] = x["layers"].split(",")
+        # Start of route
+        start = int(x["layers"][0])
+        # End if it exists
+        try:
+          end = int(x["layers"][1])
+        except IndexError:  # only one layer listed
+          end = 0
+        if start > 0: start = start - index
+        if end > 0: end = end - index
+        route = EmptyLayer()
+        module.append(route)
+        if end < 0:
+          filters = output_filters[index + start] + output_filters[index + end]
+        else:
+          filters = output_filters[index + start]
+
+      # Shortcut corresponds to skip connection
+      elif module_type == "shortcut":
+        module.append(EmptyLayer())
+
+      elif module_type == "yolo":
+        mask = x["mask"].split(",")
+        mask = [int(x) for x in mask]
+
+        anchors = x["anchors"].split(",")
+        anchors = [int(a) for a in anchors]
+        anchors = [(anchors[i], anchors[i+1]) for i in range(0, len(anchors), 2)]
+        anchors = [anchors[i] for i in mask]
+
+        detection = DetectionLayer(anchors)
+        module.append(detection)
+
+      # Append to module_list
+      module_list.append(module)
+      if filters is not None:
+        prev_filters = filters
+      output_filters.append(filters)
+
+    return (net_info, module_list)
+
+  def dump_weights(self):
+    for i in range(len(self.module_list)):
+      module_type = self.blocks[i + 1]["type"]
+      if module_type == "convolutional":
+        print(self.blocks[i + 1]["type"], "weights", i)
+        model = self.module_list[i]
+        conv = model[0]
+        print(conv.weight.cpu().data[0][0][0])
+        if conv.bias is not None:
+          print("biases")
+          print(conv.bias.shape)
+          print(conv.bias.cpu().data[0][0:5])
+        else:
+          print("None biases for layer", i)
+
+  def load_weights(self, url):
+    """Fetch a darknet weights file and copy its values into the conv / batchnorm layers in cfg order."""
+    weights = fetch(url)
+    # First 5 values (major, minor, subversion, Images seen)
+    header = np.frombuffer(weights, dtype=np.int32, count = 5)
+    self.seen = header[3]
+
+    def numel(tensor):
+      from functools import reduce
+      return reduce(lambda x, y: x*y, tensor.shape)
+
+    weights = np.frombuffer(weights, dtype=np.float32)
+    weights = weights[5:]
+
+    ptr = 0
+    for i in range(len(self.module_list)):
+      module_type = self.blocks[i + 1]["type"]
+
+      if module_type == "convolutional":
+        model = self.module_list[i]
+        try: # we have batchnorm, load conv weights without biases, and batchnorm values
+          batch_normalize = int(self.blocks[i + 1]["batch_normalize"])
+        except KeyError: # no batchnorm, load conv weights + biases
+          batch_normalize = 0
+
+        conv = model[0]
+
+        if (batch_normalize):
+          bn = model[1]
+
+          # Get the number of weights of batchnorm
+          num_bn_biases = numel(bn.bias)
+
+          # Load weights
+          bn_biases = Tensor(weights[ptr:ptr + num_bn_biases])
+          ptr += num_bn_biases
+
+          bn_weights = Tensor(weights[ptr:ptr+num_bn_biases])
+          ptr += num_bn_biases
+
+          bn_running_mean = Tensor(weights[ptr:ptr+num_bn_biases])
+          ptr += num_bn_biases
+
+          bn_running_var = Tensor(weights[ptr:ptr+num_bn_biases])
+          ptr += num_bn_biases
+
+          # Cast the loaded weights into dims of model weights
+          bn_biases = bn_biases.reshape(shape=tuple(bn.bias.shape))
+          bn_weights = bn_weights.reshape(shape=tuple(bn.weight.shape))
+          bn_running_mean = bn_running_mean.reshape(shape=tuple(bn.running_mean.shape))
+          bn_running_var = bn_running_var.reshape(shape=tuple(bn.running_var.shape))
+
+          # Copy data
+          bn.bias = bn_biases
+          bn.weight = bn_weights
+          bn.running_mean = bn_running_mean
+          bn.running_var = bn_running_var
+        else:
+          # load biases of the conv layer
+          num_biases = numel(conv.bias)
+
+          # Load weights
+          conv_biases = Tensor(weights[ptr: ptr+num_biases])
+          ptr += num_biases
+
+          # Reshape
+          conv_biases = conv_biases.reshape(shape=tuple(conv.bias.shape))
+
+          # Copy
+          conv.bias = conv_biases
+
+        # Load weights for conv layers
+        num_weights = numel(conv.weight)
+
+        conv_weights = Tensor(weights[ptr:ptr+num_weights])
+        ptr += num_weights
+
+        conv_weights = conv_weights.reshape(shape=tuple(conv.weight.shape))
+        conv.weight = conv_weights
+
+  def forward(self, x):
+    """Run x through every cfg block in order and return the outputs of all yolo blocks joined on axis 1."""
+    modules = self.blocks[1:]
+    outputs = {} # Cached outputs for route layer
+    write = 0
+
+    for i, module in enumerate(modules):
+      module_type = (module["type"])
+      st = time.time()
+      if module_type == "convolutional" or module_type == "upsample":
+        for index, layer in enumerate(self.module_list[i]):
+          x = layer(x)
+
+      elif module_type == "route":
+        layers = module["layers"]
+        layers = [int(a) for a in layers]
+
+        if (layers[0]) > 0:
+          layers[0] = layers[0] - i
+        if len(layers) == 1:
+          x = outputs[i + (layers[0])]
+        else:
+          if (layers[1]) > 0: layers[1] = layers[1] - i
+
+          map1 = outputs[i + layers[0]]
+          map2 = outputs[i + layers[1]]
+
+          x = Tensor(np.concatenate((map1.cpu().data, map2.cpu().data), 1))
+
+      elif module_type == "shortcut":
+        from_ = int(module["from"])
+        x = outputs[i - 1] + outputs[i + from_]
+
+      elif module_type == "yolo":
+        anchors = self.module_list[i][0].anchors
+        inp_dim = int(self.net_info["height"])
+        # inp_dim = 416
+
+        num_classes = int(module["classes"])
+        # Transform
+        x = predict_transform(x, inp_dim, anchors, num_classes)
+        if not write:
+          detections = x
+          write = 1
+        else:
+          detections = Tensor(np.concatenate((detections.cpu().data, x.cpu().data), 1))
+
+      # print(module_type, 'layer took %.2f s' % (time.time() - st))
+      outputs[i] = x
+
+    return detections # Return detections
+
+if __name__ == "__main__":
+  cfg = fetch('https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3.cfg') # normal model
+  # cfg = fetch('https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3-tiny.cfg') # tiny model
+
+  # Make deterministic
+  np.random.seed(1337)
+
+  # Start model
+  model = Darknet(cfg)
+
+  print("Loading weights file (237MB). This might take a while…")
+  model.load_weights('https://pjreddie.com/media/files/yolov3.weights') # normal model
+  # model.load_weights('https://pjreddie.com/media/files/yolov3-tiny.weights') # tiny model
+
+  if GPU:
+    params = get_parameters(model)
+    for x in params: x.gpu_()
+
+  url = sys.argv[1]  # 'webcam', an http(s) URL, or a local image path
+  # url = "https://github.com/ayooshkathuria/pytorch-yolo-v3/raw/master/dog-cycle-car.png"
+
+  img = None
+  # We use cv2 because for some reason, cv2 imread produces better results?
+  if url == 'webcam':
+    cap = cv2.VideoCapture(0)
+    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)
+    while 1:
+      _ = cap.grab() # discard one frame to circumvent capture buffering
+      ret, frame = cap.read()
+      img = Image.fromarray(frame[:, :, [2,1,0]])
+
+      prediction = infer(model, img)
+      prediction = process_results(prediction)
+
+      boxes = add_boxes(imresize(np.array(img), 608, 608), prediction)
+      boxes = cv2.cvtColor(boxes, cv2.COLOR_RGB2BGR)
+      cv2.imshow('yolo', boxes)
+      if cv2.waitKey(1) & 0xFF == ord('q'):
+        break
+    cap.release()
+    cv2.destroyAllWindows()
+  elif url.startswith('http'):
+    img_stream = io.BytesIO(fetch(url))
+    img = cv2.imdecode(np.frombuffer(img_stream.read(), np.uint8), 1)  # frombuffer: binary np.fromstring is deprecated
+  else:
+    img = cv2.imread(url)
+  # NOTE(review): the webcam branch also falls through to here with its last frame - confirm intent.
+  # Predict
+  st = time.time()
+  print('running inference…')
+  prediction = infer(model, img)
+  print('did inference in %.2f s' % (time.time() - st))
+
+  prediction = process_results(prediction)
+  # print(prediction)
+  boxes = add_boxes(imresize(img, 608, 608), prediction)
+  # Save img
+  cv2.imwrite('boxes.jpg', boxes)
diff --git a/examples/yolov5.py b/examples/yolov5.py
deleted file mode 100755
index 627b02323d..0000000000
--- a/examples/yolov5.py
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/env python3
-import io
-import pickle
-from extra.utils import fetch, my_unpickle
-
-if __name__ == "__main__":
-  dat = fetch('https://github.com/ultralytics/yolov5/releases/download/v4.0/yolov5s.pt')
-
-  import zipfile
-  fp = zipfile.ZipFile(io.BytesIO(dat))
-  #fp.printdir()
-  data = fp.read('archive/data.pkl')
-
-  # yolo specific
-  ret, out = my_unpickle(io.BytesIO(data))
-  d = ret['model'].yaml
-  for i, (f, n, m, args) in enumerate(d['backbone'] + d['head']):
-    tm = ret['model']._modules['model'][i]
-    print(i, f, n, m, args, tm._modules.keys())
-    # Focus, Conv, BottleneckCSP, SPP, Concat, Detect
-    #for k,v in tm._modules.items():
-    #  print("   ", k, v)
-    if m in "Focus":
-      conv = tm._modules['conv']
-      print("   ", conv._modules)
-    if m in "Conv":
-      conv, bn = tm._modules['conv'], tm._modules['bn']
-      print("   ", conv)
-      #print(bn)
-
-
-
diff --git a/test/test_nn.py b/test/test_nn.py
index c502f9f871..1eaa8aeeec 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -47,7 +47,7 @@ class TestNN(unittest.TestCase):
     np.testing.assert_allclose(bn.running_mean.data, tbn.running_mean.detach().numpy(), rtol=1e-5)
 
     # TODO: this is failing
-    #np.testing.assert_allclose(bn.running_var.data, tbn.running_var.detach().numpy(), rtol=1e-5)
+    # np.testing.assert_allclose(bn.running_var.data, tbn.running_var.detach().numpy(), rtol=1e-5)
 
   def test_batchnorm2d_training(self):
     self.test_batchnorm2d(True)
diff --git a/tinygrad/nn.py b/tinygrad/nn.py
index 17a99d5941..2a705e332d 100644
--- a/tinygrad/nn.py
+++ b/tinygrad/nn.py
@@ -18,6 +18,7 @@ class BatchNorm2D:
     if self.track_running_stats:
       self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * batch_mean
       self.running_var = (1 - self.momentum) * self.running_var + self.momentum * batch_var
+      if self.num_batches_tracked is None: self.num_batches_tracked = Tensor.zeros(1, requires_grad=False)
       self.num_batches_tracked += 1
 
     if self.training:
diff --git a/tinygrad/ops_gpu.py b/tinygrad/ops_gpu.py
index ce8d1c2110..7046d215b1 100644
--- a/tinygrad/ops_gpu.py
+++ b/tinygrad/ops_gpu.py
@@ -371,7 +371,7 @@ class Conv2D(Function):
     ys,xs = ctx.stride
     bs,cin_,iy,ix = x.shape
     oy,ox = (iy-(H-ys))//ys, (ix-(W-xs))//xs
-    assert cin*ctx.groups == cin_
+    if cin*ctx.groups != cin_: raise Exception(f"Input Tensor shape {x.shape} does not match the shape of the weights {w.shape}. ({cin*ctx.groups} vs. {cin_})")
     assert cout % ctx.groups == 0
     rcout = cout//ctx.groups