* bump tg

* bump tg

* assign

* bump

* cpu llvm

* frame buffer updated in place, no need to return

* don't bake in stale pointers

* fix update image output indices

* lint

* bump
This commit is contained in:
Armand du Parc Locmaria
2026-04-02 09:16:11 -07:00
committed by GitHub
parent cb32793300
commit 55c3885742
4 changed files with 13 additions and 25 deletions

View File

@@ -18,7 +18,7 @@ def estimate_pickle_max_size(onnx_size):
tg_flags = {
'larch64': 'DEV=QCOM FLOAT16=1 NOLOCALS=1 JIT_BATCH_SIZE=0',
'Darwin': f'DEV=CPU THREADS=0 HOME={os.path.expanduser("~")}', # tinygrad calls brew which needs a $HOME in the env
}.get(arch, 'DEV=CPU CPU_LLVM=1 THREADS=0')
}.get(arch, 'DEV=CPU:LLVM THREADS=0')
# Get model metadata
for model_name in ['driving_vision', 'driving_off_policy', 'driving_on_policy', 'dmonitoring_model']:

View File

@@ -94,11 +94,11 @@ def make_frame_prepare(cam_w, cam_h, model_w, model_h):
def make_update_img_input(frame_prepare, model_w, model_h):
def update_img_input_tinygrad(tensor, frame, M_inv):
def update_img_input_tinygrad(frame_buffer, frame, M_inv):
M_inv = M_inv.to(Device.DEFAULT)
new_img = frame_prepare(frame, M_inv)
full_buffer = tensor[6:].cat(new_img, dim=0).contiguous()
return full_buffer, Tensor.cat(full_buffer[:6], full_buffer[-6:], dim=0).contiguous().reshape(1, 12, model_h//2, model_w//2)
frame_buffer.assign(frame_buffer[6:].cat(new_img, dim=0).contiguous())
return Tensor.cat(frame_buffer[:6], frame_buffer[-6:], dim=0).contiguous().reshape(1, 12, model_h//2, model_w//2)
return update_img_input_tinygrad
@@ -107,9 +107,9 @@ def make_update_both_imgs(frame_prepare, model_w, model_h):
def update_both_imgs_tinygrad(calib_img_buffer, new_img, M_inv,
calib_big_img_buffer, new_big_img, M_inv_big):
calib_img_buffer, calib_img_pair = update_img(calib_img_buffer, new_img, M_inv)
calib_big_img_buffer, calib_big_img_pair = update_img(calib_big_img_buffer, new_big_img, M_inv_big)
return calib_img_buffer, calib_img_pair, calib_big_img_buffer, calib_big_img_pair
calib_img_pair = update_img(calib_img_buffer, new_img, M_inv)
calib_big_img_pair = update_img(calib_big_img_buffer, new_big_img, M_inv_big)
return calib_img_pair, calib_big_img_pair
return update_both_imgs_tinygrad
@@ -136,29 +136,18 @@ def compile_modeld_warp(cam_w, cam_h):
full_buffer = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()
big_full_buffer = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()
full_buffer_np = np.zeros(IMG_BUFFER_SHAPE, dtype=np.uint8)
big_full_buffer_np = np.zeros(IMG_BUFFER_SHAPE, dtype=np.uint8)
for i in range(10):
new_frame_np = (32 * np.random.randn(yuv_size).astype(np.float32) + 128).clip(0, 255).astype(np.uint8)
img_inputs = [full_buffer,
Tensor.from_blob(new_frame_np.ctypes.data, (yuv_size,), dtype='uint8').realize(),
Tensor(np.random.randint(0, 256, yuv_size, dtype=np.uint8)).realize(),
Tensor(Tensor.randn(3, 3).mul(8).realize().numpy(), device='NPY')]
new_big_frame_np = (32 * np.random.randn(yuv_size).astype(np.float32) + 128).clip(0, 255).astype(np.uint8)
big_img_inputs = [big_full_buffer,
Tensor.from_blob(new_big_frame_np.ctypes.data, (yuv_size,), dtype='uint8').realize(),
Tensor(np.random.randint(0, 256, yuv_size, dtype=np.uint8)).realize(),
Tensor(Tensor.randn(3, 3).mul(8).realize().numpy(), device='NPY')]
inputs = img_inputs + big_img_inputs
Device.default.synchronize()
inputs_np = [x.numpy() for x in inputs]
inputs_np[0] = full_buffer_np
inputs_np[3] = big_full_buffer_np
st = time.perf_counter()
out = update_img_jit(*inputs)
full_buffer = out[0].contiguous().realize().clone()
big_full_buffer = out[2].contiguous().realize().clone()
_ = update_img_jit(*inputs)
mt = time.perf_counter()
Device.default.synchronize()
et = time.perf_counter()
@@ -183,7 +172,7 @@ def compile_dm_warp(cam_w, cam_h):
warp_dm_jit = TinyJit(warp_dm, prune=True)
for i in range(10):
inputs = [Tensor.from_blob((32 * Tensor.randn(yuv_size,) + 128).cast(dtype='uint8').realize().numpy().ctypes.data, (yuv_size,), dtype='uint8'),
inputs = [Tensor(np.random.randint(0, 256, yuv_size, dtype=np.uint8)).realize(),
Tensor(Tensor.randn(3, 3).mul(8).realize().numpy(), device='NPY')]
Device.default.synchronize()
st = time.perf_counter()

View File

@@ -222,8 +222,7 @@ class ModelState:
out = self.update_imgs(self.img_queues['img'], self.full_frames['img'], self.transforms['img'],
self.img_queues['big_img'], self.full_frames['big_img'], self.transforms['big_img'])
self.img_queues['img'], self.img_queues['big_img'] = out[0].realize(), out[2].realize()
vision_inputs = {'img': out[1], 'big_img': out[3]}
vision_inputs = {'img': out[0], 'big_img': out[1]}
if prepare_only:
return None