Profile GPU and CPU copying. (#182)

Moving memory is slow; therefore, monitoring the time spent converting and limiting the number of copy operations can improve performance.
2026-06-13 00:15:35 +08:00 · 2020-12-12 21:15:47 +01:00
parent 8e8cbc74b3
commit bf9ba8718a
1 changed files with 12 additions and 10 deletions
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -157,11 +157,12 @@ class Tensor:

  def cpu(self):
    if self.gpu:
-      ret = Tensor(np.empty(self.shape, dtype=np.float32), gpu=False)
-      cl.enqueue_copy(cl_queue, ret.data, self.data.cl, is_blocking=True)
-      if self.grad:
-        ret.grad = self.grad.cpu()
-      return ret
+      with ProfileOp("toCPU", [self]):
+        ret = Tensor(np.empty(self.shape, dtype=np.float32), gpu=False)
+        cl.enqueue_copy(cl_queue, ret.data, self.data.cl, is_blocking=True)
+        if self.grad:
+          ret.grad = self.grad.cpu()
+        return ret
    else:
      return self

@@ -173,11 +174,12 @@ class Tensor:
    if not GPU:
      raise Exception("No GPU Support, install pyopencl")
    if not self.gpu:
-      require_init_gpu()
-      ret = Tensor(GPUBuffer(self.shape, self.data))
-      if self.grad:
-        ret.grad = self.grad.cuda()
-      return ret
+      with ProfileOp("toGPU", [self]):
+        require_init_gpu()
+        ret = Tensor(GPUBuffer(self.shape, self.data))
+        if self.grad:
+          ret.grad = self.grad.cuda()
+        return ret
    else:
      return self