Merge branch 'master' into tinygrad-sync-4/25

commit
2026-06-22 21:02:06 +08:00 · 2026-05-17 21:20:04 -04:00 · 2026-05-03 11:48:11 -07:00 · 2026-04-28 11:34:56 -07:00 · 2026-04-28 11:34:04 -07:00 · 2026-04-27 20:27:54 -07:00
18 changed files with 596 additions and 418 deletions
@@ -172,7 +172,7 @@ jobs:
            output_file="${{ env.MODELS_DIR }}/${base_name}_tinygrad.pkl"
            echo "Compiling: $onnx_file -> $output_file"
-            QCOM=1 python3 "${{ env.TINYGRAD_PATH }}/examples/openpilot/compile3.py" "$onnx_file" "$output_file"
+            DEV=QCOM FLOAT16=1 NOLOCALS=1 JIT_BATCH_SIZE=0 IMAGE=2 python3 "${{ env.TINYGRAD_PATH }}/examples/openpilot/compile3.py" "$onnx_file" "$output_file"
            DEV=QCOM FLOAT16=1 NOLOCALS=1 JIT_BATCH_SIZE=0 python3 "${{ env.MODELS_DIR }}/../get_model_metadata.py" "$onnx_file" || true
          done
@@ -1,10 +1,23 @@
 import glob
 import json
 import os
 from itertools import product
 from SCons.Script import Value
 from openpilot.common.file_chunker import chunk_file, get_chunk_paths
 from openpilot.common.transformations.camera import _ar_ox_fisheye, _os_fisheye
 from openpilot.common.transformations.model import MEDMODEL_INPUT_SIZE, DM_INPUT_SIZE
 from openpilot.selfdrive.modeld.constants import ModelConstants
 from openpilot.selfdrive.modeld.helpers import CompileConfig
 from tinygrad import Device
 CAMERA_CONFIGS = [
  (_ar_ox_fisheye.width, _ar_ox_fisheye.height),  # tici: 1928x1208
  (_os_fisheye.width, _os_fisheye.height),        # mici: 1344x760
 ]
 MODELD_CONFIGS = [CompileConfig(cam_w, cam_h, prepare_only, 'driving_')
                  for (cam_w, cam_h), prepare_only in product(CAMERA_CONFIGS, [True, False])]
 DM_WARP_CONFIGS = [CompileConfig(cam_w, cam_h, True, 'dm_') for cam_w, cam_h in CAMERA_CONFIGS]
 Import('env', 'arch')
 chunker_file = File("#common/file_chunker.py")
 lenv = env.Clone()
@@ -16,18 +29,17 @@ tinygrad_files = ["#"+x for x in glob.glob(env.Dir("#tinygrad_repo").relpath + "
 def estimate_pickle_max_size(onnx_size):
  return 1.2 * onnx_size  + 10 * 1024 * 1024  # 20% + 10MB is plenty
 # THREADS=0 is need to prevent bug: https://github.com/tinygrad/tinygrad/issues/14689
 # get fastest TG config
 available = set(Device.get_available_devices())
-# FIXME-SP: reset when we bump tg
+if 'CUDA' in available:
 if False:  # 'CUDA' in available:
  tg_backend = 'CUDA'
  tg_flags = f'DEV={tg_backend}'
 elif 'QCOM' in available:
  tg_backend = 'QCOM'
-  tg_flags = f'DEV={tg_backend} FLOAT16=1 NOLOCALS=1 JIT_BATCH_SIZE=0'
+  tg_flags = f'DEV={tg_backend} FLOAT16=1 NOLOCALS=1 JIT_BATCH_SIZE=0 OPENPILOT_HACKS=1'
 else:
-  tg_backend = 'CPU' if arch == 'Darwin' else 'CPU CPU_LLVM=1'  # FIXME-SP: reset when we bump tg
+  tg_backend = 'CPU' if arch == 'Darwin' else 'CPU:LLVM'
  # THREADS=0 is need to prevent bug: https://github.com/tinygrad/tinygrad/issues/14689
  tg_flags = f'DEV={tg_backend} THREADS=0'
 def write_tg_compiled_flags(target, source, env):
@@ -54,14 +66,35 @@ for model_name in ['driving_vision', 'driving_policy', 'dmonitoring_model']:
 image_flag = {
     'larch64': 'IMAGE=2',
 }.get(arch, 'IMAGE=0')
-script_files = [File(Dir("#selfdrive/modeld").File("compile_warp.py").abspath)]
+modeld_dir = Dir("#selfdrive/modeld").abspath
-compile_warp_cmd = f'{tg_flags} {mac_brew_string} python3 {Dir("#selfdrive/modeld").abspath}/compile_warp.py '
+compile_modeld_script = [File(f"{modeld_dir}/compile_modeld.py")]
-from openpilot.common.transformations.camera import _ar_ox_fisheye, _os_fisheye
+compile_dm_warp_script = [File(f"{modeld_dir}/compile_dm_warp.py")]
-warp_targets = []
+driving_onnx_deps = [File(f"models/{m}.onnx").abspath for m in ['driving_vision', 'driving_policy']]
-for cam in [_ar_ox_fisheye, _os_fisheye]:
+driving_metadata_deps = [File(f"models/{m}_metadata.pkl").abspath for m in ['driving_vision', 'driving_policy']]
-  w, h = cam.width, cam.height
+
-  warp_targets += [File(f"models/warp_{w}x{h}_tinygrad.pkl").abspath, File(f"models/dm_warp_{w}x{h}_tinygrad.pkl").abspath]
+model_w, model_h = MEDMODEL_INPUT_SIZE
-lenv.Command(warp_targets, tinygrad_files + script_files + [compiled_flags_node], compile_warp_cmd)
+frame_skip = ModelConstants.MODEL_RUN_FREQ // ModelConstants.MODEL_CONTEXT_FREQ
 for cfg in MODELD_CONFIGS:
  cmd = (f'{tg_flags} {mac_brew_string} {image_flag} python3 {modeld_dir}/compile_modeld.py '
         f'--model-size {model_w}x{model_h} '
         f'--nv12 {",".join(str(x) for x in cfg.nv12)} '
         f'--vision-onnx {File("models/driving_vision.onnx").abspath} '
         f'--policy-onnx {File("models/driving_policy.onnx").abspath} '
         f'--output {cfg.pkl_path} --frame-skip {frame_skip}'
         + (' --prepare-only' if cfg.prepare_only else ''))
  node = lenv.Command(cfg.pkl_path, tinygrad_files + compile_modeld_script + driving_onnx_deps + driving_metadata_deps + [chunker_file, compiled_flags_node], cmd)
  onnx_sizes_sum = sum(os.path.getsize(f) for f in driving_onnx_deps)
  chunk_targets = get_chunk_paths(cfg.pkl_path, estimate_pickle_max_size(onnx_sizes_sum))
  def do_chunk(target, source, env, pkl=cfg.pkl_path, chunks=chunk_targets):
    chunk_file(pkl, chunks)
  lenv.Command(chunk_targets, node, do_chunk)
 dm_w, dm_h = DM_INPUT_SIZE
 for cfg in DM_WARP_CONFIGS:
  cmd = (f'{tg_flags} {mac_brew_string} {image_flag} python3 {modeld_dir}/compile_dm_warp.py '
         f'--nv12 {",".join(str(x) for x in cfg.nv12)} --warp-to {dm_w}x{dm_h} '
         f'--output {cfg.pkl_path}')
  lenv.Command(cfg.pkl_path, tinygrad_files + compile_dm_warp_script + compile_modeld_script + [compiled_flags_node], cmd)
 def tg_compile(flags, model_name):
  pythonpath_string = 'PYTHONPATH="${PYTHONPATH}:' + env.Dir("#tinygrad_repo").abspath + '"'
@@ -82,7 +115,4 @@ def tg_compile(flags, model_name):
    do_chunk,
  )
-# Compile small models
+tg_compile(tg_flags, 'dmonitoring_model')
 for model_name in ['driving_vision', 'driving_policy', 'dmonitoring_model']:
  tg_compile(tg_flags, model_name)
@@ -0,0 +1,54 @@
 #!/usr/bin/env python3
 import argparse
 import pickle
 import time
 from tinygrad.tensor import Tensor
 from tinygrad.device import Device
 from tinygrad.engine.jit import TinyJit
 from openpilot.selfdrive.modeld.compile_modeld import NV12Frame, warp_perspective_tinygrad, _parse_size, _parse_nv12
 def make_warp_dm(nv12: NV12Frame, dm_w, dm_h):
  cam_w, cam_h, stride, _, _, _ = nv12
  stride_pad = stride - cam_w
  def warp_dm(input_frame, M_inv):
    M_inv = M_inv.to(Device.DEFAULT).realize()
    return warp_perspective_tinygrad(input_frame[:cam_h*stride], M_inv,
                                     (dm_w, dm_h), (cam_h, cam_w), stride_pad).reshape(-1, dm_h * dm_w)
  return warp_dm
 def compile_dm_warp(nv12: NV12Frame, dm_w, dm_h, pkl_path):
  print(f"Compiling DM warp for {nv12.width}x{nv12.height} -> {dm_w}x{dm_h}...")
  warp_dm_jit = TinyJit(make_warp_dm(nv12, dm_w, dm_h), prune=True)
  for i in range(10):
    frame = Tensor.randint(nv12.size, low=0, high=256, dtype='uint8').realize()
    M_inv = Tensor(Tensor.randn(3, 3).mul(8).realize().numpy(), device='NPY')
    Device.default.synchronize()
    st = time.perf_counter()
    warp_dm_jit(frame, M_inv).realize()
    mt = time.perf_counter()
    Device.default.synchronize()
    et = time.perf_counter()
    print(f"  [{i+1}/10] enqueue {(mt-st)*1e3:6.2f} ms -- total {(et-st)*1e3:6.2f} ms")
  with open(pkl_path, "wb") as f:
    pickle.dump(warp_dm_jit, f)
  print(f"  Saved to {pkl_path}")
 if __name__ == "__main__":
  p = argparse.ArgumentParser()
  p.add_argument('--nv12', type=_parse_nv12, required=True,
                 help=f'NV12 frame layout: {",".join(NV12Frame._fields)}')
  p.add_argument('--warp-to', type=_parse_size, required=True, help='DM input WxH')
  p.add_argument('--output', required=True)
  args = p.parse_args()
  dm_w, dm_h = args.warp_to
  compile_dm_warp(args.nv12, dm_w, dm_h, args.output)
@@ -0,0 +1,253 @@
 #!/usr/bin/env python3
 import argparse
 import pickle
 import time
 from functools import partial
 from collections import namedtuple
 import numpy as np
 from tinygrad.tensor import Tensor
 from tinygrad.helpers import Context
 from tinygrad.device import Device
 from tinygrad.engine.jit import TinyJit
 from tinygrad.nn.onnx import OnnxRunner
 # https://github.com/tinygrad/tinygrad/issues/15682
 from tinygrad.uop.ops import UOp, Ops
 _orig = UOp.__reduce__
 UOp.__reduce__ = lambda self: (UOp.unique, ()) if self.op is Ops.UNIQUE else _orig(self)
 NV12Frame = namedtuple("NV12Frame", ['width', 'height', 'stride', 'y_height', 'uv_height', 'size'])
 UV_SCALE_MATRIX = np.array([[0.5, 0, 0], [0, 0.5, 0], [0, 0, 1]], dtype=np.float32)
 UV_SCALE_MATRIX_INV = np.linalg.inv(UV_SCALE_MATRIX)
 def warp_perspective_tinygrad(src_flat, M_inv, dst_shape, src_shape, stride_pad):
  w_dst, h_dst = dst_shape
  h_src, w_src = src_shape
  x = Tensor.arange(w_dst).reshape(1, w_dst).expand(h_dst, w_dst).reshape(-1)
  y = Tensor.arange(h_dst).reshape(h_dst, 1).expand(h_dst, w_dst).reshape(-1)
  # inline 3x3 matmul as elementwise to avoid reduce op (enables fusion with gather)
  src_x = M_inv[0, 0] * x + M_inv[0, 1] * y + M_inv[0, 2]
  src_y = M_inv[1, 0] * x + M_inv[1, 1] * y + M_inv[1, 2]
  src_w = M_inv[2, 0] * x + M_inv[2, 1] * y + M_inv[2, 2]
  src_x = src_x / src_w
  src_y = src_y / src_w
  x_nn_clipped = Tensor.round(src_x).clip(0, w_src - 1).cast('int')
  y_nn_clipped = Tensor.round(src_y).clip(0, h_src - 1).cast('int')
  idx = y_nn_clipped * (w_src + stride_pad) + x_nn_clipped
  return src_flat[idx]
 def frames_to_tensor(frames):
  H = (frames.shape[0] * 2) // 3
  W = frames.shape[1]
  in_img1 = Tensor.cat(frames[0:H:2, 0::2],
                       frames[1:H:2, 0::2],
                       frames[0:H:2, 1::2],
                       frames[1:H:2, 1::2],
                       frames[H:H+H//4].reshape((H//2, W//2)),
                       frames[H+H//4:H+H//2].reshape((H//2, W//2)), dim=0).reshape((6, H//2, W//2))
  return in_img1
 def make_frame_prepare(nv12: NV12Frame, model_w, model_h):
  cam_w, cam_h, stride, y_height, uv_height, _ = nv12
  uv_offset = stride * y_height
  stride_pad = stride - cam_w
  def frame_prepare_tinygrad(input_frame, M_inv):
    # UV_SCALE @ M_inv @ UV_SCALE_INV simplifies to elementwise scaling
    M_inv_uv = M_inv * Tensor([[1.0, 1.0, 0.5], [1.0, 1.0, 0.5], [2.0, 2.0, 1.0]])
    # deinterleave NV12 UV plane (UVUV... -> separate U, V)
    uv = input_frame[uv_offset:uv_offset + uv_height * stride].reshape(uv_height, stride)
    with Context(SPLIT_REDUCEOP=0):
      y = warp_perspective_tinygrad(input_frame[:cam_h*stride],
                                    M_inv, (model_w, model_h),
                                    (cam_h, cam_w), stride_pad).realize()
      u = warp_perspective_tinygrad(uv[:cam_h//2, :cam_w:2].flatten(),
                                    M_inv_uv, (model_w//2, model_h//2),
                                    (cam_h//2, cam_w//2), 0).realize()
      v = warp_perspective_tinygrad(uv[:cam_h//2, 1:cam_w:2].flatten(),
                                    M_inv_uv, (model_w//2, model_h//2),
                                    (cam_h//2, cam_w//2), 0).realize()
    yuv = y.cat(u).cat(v).reshape((model_h * 3 // 2, model_w))
    tensor = frames_to_tensor(yuv)
    return tensor
  return frame_prepare_tinygrad
 def make_input_queues(vision_input_shapes, policy_input_shapes, frame_skip):
  img = vision_input_shapes['img']  # (1, 12, 128, 256)
  n_frames = img[1] // 6
  img_buf_shape = (frame_skip * (n_frames - 1) + 1, 6, img[2], img[3])
  fb = policy_input_shapes['features_buffer']  # (1, 25, 512)
  dp = policy_input_shapes['desire_pulse']  # (1, 25, 8)
  tc = policy_input_shapes['traffic_convention']  # (1, 2)
  npy = {
    'desire': np.zeros(dp[2], dtype=np.float32),
    'traffic_convention': np.zeros(tc, dtype=np.float32),
    'tfm': np.zeros((3, 3), dtype=np.float32),
    'big_tfm': np.zeros((3, 3), dtype=np.float32),
  }
  input_queues = {
    'img_q': Tensor.zeros(img_buf_shape, dtype='uint8').contiguous().realize(),
    'big_img_q': Tensor.zeros(img_buf_shape, dtype='uint8').contiguous().realize(),
    'feat_q': Tensor.zeros(frame_skip * (fb[1] - 1) + 1, fb[0], fb[2]).contiguous().realize(),
    'desire_q': Tensor.zeros(frame_skip * dp[1], dp[0], dp[2]).contiguous().realize(),
    **{k: Tensor(v, device='NPY').realize() for k, v in npy.items()},
  }
  return input_queues, npy
 def shift_and_sample(buf, new_val, sample_fn):
  buf.assign(buf[1:].cat(new_val, dim=0).contiguous())
  return sample_fn(buf)
 def sample_skip(buf, frame_skip):
  return buf[::frame_skip].contiguous().flatten(0, 1).unsqueeze(0)
 def sample_desire(buf, frame_skip):
  return buf.reshape(-1, frame_skip, *buf.shape[1:]).max(1).flatten(0, 1).unsqueeze(0)
 def make_run_policy(vision_runner, policy_runner, nv12: NV12Frame, model_w, model_h,
                    vision_features_slice, frame_skip, prepare_only=False):
  frame_prepare = make_frame_prepare(nv12, model_w, model_h)
  sample_skip_fn = partial(sample_skip, frame_skip=frame_skip)
  sample_desire_fn = partial(sample_desire, frame_skip=frame_skip)
  def run_policy(img_q, big_img_q, feat_q, desire_q, desire, traffic_convention, tfm, big_tfm, frame, big_frame):
    tfm = tfm.to(Device.DEFAULT)
    big_tfm = big_tfm.to(Device.DEFAULT)
    desire = desire.to(Device.DEFAULT)
    traffic_convention = traffic_convention.to(Device.DEFAULT)
    Tensor.realize(tfm, big_tfm, desire, traffic_convention)
    img = shift_and_sample(img_q, frame_prepare(frame, tfm).unsqueeze(0), sample_skip_fn)
    big_img = shift_and_sample(big_img_q, frame_prepare(big_frame, big_tfm).unsqueeze(0), sample_skip_fn)
    if prepare_only:
      return img, big_img
    vision_out = next(iter(vision_runner({'img': img, 'big_img': big_img}).values())).cast('float32')
    new_feat = vision_out[:, vision_features_slice].reshape(1, -1).unsqueeze(0)
    feat_buf = shift_and_sample(feat_q, new_feat, sample_skip_fn)
    desire_buf = shift_and_sample(desire_q, desire.reshape(1, 1, -1), sample_desire_fn)
    inputs = {'features_buffer': feat_buf, 'desire_pulse': desire_buf, 'traffic_convention': traffic_convention}
    policy_out = next(iter(policy_runner(inputs).values())).cast('float32')
    return vision_out, policy_out
  return run_policy
 def compile_modeld(nv12: NV12Frame, model_w, model_h, prepare_only, frame_skip,
                   vision_onnx, policy_onnx, pkl_path):
  from get_model_metadata import metadata_path_for
  print(f"Compiling combined policy JIT for {nv12.width}x{nv12.height} (prepare_only={prepare_only})...")
  vision_runner = OnnxRunner(vision_onnx)
  policy_runner = OnnxRunner(policy_onnx)
  with open(metadata_path_for(vision_onnx), 'rb') as f:
    vision_metadata = pickle.load(f)
    vision_features_slice = vision_metadata['output_slices']['hidden_state']
    vision_input_shapes = vision_metadata['input_shapes']
  with open(metadata_path_for(policy_onnx), 'rb') as f:
    policy_input_shapes = pickle.load(f)['input_shapes']
  _run = make_run_policy(vision_runner, policy_runner, nv12, model_w, model_h,
                         vision_features_slice, frame_skip, prepare_only)
  run_policy_jit = TinyJit(_run, prune=True)
  N_RUNS = 3
  SEED = 42
  def random_inputs_run_fn(fn, seed, test_val=None, test_buffers=None, expect_match=True):
    input_queues, npy = make_input_queues(vision_input_shapes, policy_input_shapes, frame_skip)
    np.random.seed(seed)
    Tensor.manual_seed(seed)
    for i in range(N_RUNS):
      frame = Tensor.randint(nv12.size, low=0, high=256, dtype='uint8').realize()
      big_frame = Tensor.randint(nv12.size, low=0, high=256, dtype='uint8').realize()
      for v in npy.values():
        v[:] = np.random.randn(*v.shape).astype(v.dtype)
      Device.default.synchronize()
      st = time.perf_counter()
      outs = fn(**input_queues, frame=frame, big_frame=big_frame)
      mt = time.perf_counter()
      for o in outs:
        # .realize() not needed once jitted, but needed for unjitted fn
        o.realize()
      Device.default.synchronize()
      et = time.perf_counter()
      print(f"  [{i+1}/{N_RUNS}] enqueue {(mt-st)*1e3:6.2f} ms -- total {(et-st)*1e3:6.2f} ms")
    val = [np.copy(v.numpy()) for v in outs]
    buffers = [np.copy(v.numpy().copy()) for v in input_queues.values()]
    if test_val is not None:
      match = all(np.array_equal(a, b) for a, b in zip(val, test_val, strict=True))
      assert match == expect_match, f"outputs {'differ from' if expect_match else 'match'} baseline (seed={seed})"
    if test_buffers is not None:
      match = all(np.array_equal(a, b) for a, b in zip(buffers, test_buffers, strict=True))
      assert match == expect_match, f"buffers {'differ from' if expect_match else 'match'} baseline (seed={seed})"
    return fn, val, buffers
  print('run unjitted')
  _, test_val, test_buffers = random_inputs_run_fn(_run, seed=SEED)
  print('capture + replay')
  run_policy_jit, _, _ = random_inputs_run_fn(run_policy_jit, SEED, test_val, test_buffers)
  print('pickle round trip')
  with open(pkl_path, "wb") as f:
    pickle.dump(run_policy_jit, f)
    print(f"  Saved to {pkl_path}")
  with open(pkl_path, "rb") as f:
    run_policy_jit = pickle.load(f)
  random_inputs_run_fn(run_policy_jit, SEED, test_val, test_buffers, expect_match=True)
  random_inputs_run_fn(run_policy_jit, SEED+1, test_val, test_buffers, expect_match=False)
 def _parse_size(s):
  w, h = s.lower().split('x')
  return int(w), int(h)
 def _parse_nv12(s):
  parts = s.split(',')
  assert len(parts) == len(NV12Frame._fields), \
    f"--nv12 expects {','.join(NV12Frame._fields)} (got {s!r})"
  return NV12Frame(*(int(x) for x in parts))
 if __name__ == "__main__":
  p = argparse.ArgumentParser()
  p.add_argument('--model-size', type=_parse_size, required=True, help='model input WxH')
  p.add_argument('--nv12', type=_parse_nv12, required=True,
                 help=f'NV12 frame layout: {",".join(NV12Frame._fields)}')
  p.add_argument('--vision-onnx', required=True)
  p.add_argument('--policy-onnx', required=True)
  p.add_argument('--output', required=True)
  p.add_argument('--prepare-only', action='store_true')
  p.add_argument('--frame-skip', type=int, required=True)
  args = p.parse_args()
  model_w, model_h = args.model_size
  compile_modeld(args.nv12, model_w, model_h, args.prepare_only, args.frame_skip,
                 args.vision_onnx, args.policy_onnx, args.output)
@@ -1,201 +0,0 @@
 #!/usr/bin/env python3
 import time
 import pickle
 import numpy as np
 from pathlib import Path
 from tinygrad.tensor import Tensor
 from tinygrad.helpers import Context
 from tinygrad.device import Device
 from tinygrad.engine.jit import TinyJit
 from openpilot.system.camerad.cameras.nv12_info import get_nv12_info
 from openpilot.common.transformations.model import MEDMODEL_INPUT_SIZE, DM_INPUT_SIZE
 from openpilot.common.transformations.camera import _ar_ox_fisheye, _os_fisheye
 MODELS_DIR = Path(__file__).parent / 'models'
 CAMERA_CONFIGS = [
  (_ar_ox_fisheye.width, _ar_ox_fisheye.height),  # tici: 1928x1208
  (_os_fisheye.width, _os_fisheye.height),        # mici: 1344x760
 ]
 UV_SCALE_MATRIX = np.array([[0.5, 0, 0], [0, 0.5, 0], [0, 0, 1]], dtype=np.float32)
 UV_SCALE_MATRIX_INV = np.linalg.inv(UV_SCALE_MATRIX)
 IMG_BUFFER_SHAPE = (30, MEDMODEL_INPUT_SIZE[1] // 2, MEDMODEL_INPUT_SIZE[0] // 2)
 def warp_pkl_path(w, h):
  return MODELS_DIR / f'warp_{w}x{h}_tinygrad.pkl'
 def dm_warp_pkl_path(w, h):
  return MODELS_DIR / f'dm_warp_{w}x{h}_tinygrad.pkl'
 def warp_perspective_tinygrad(src_flat, M_inv, dst_shape, src_shape, stride_pad):
  w_dst, h_dst = dst_shape
  h_src, w_src = src_shape
  x = Tensor.arange(w_dst).reshape(1, w_dst).expand(h_dst, w_dst).reshape(-1)
  y = Tensor.arange(h_dst).reshape(h_dst, 1).expand(h_dst, w_dst).reshape(-1)
  # inline 3x3 matmul as elementwise to avoid reduce op (enables fusion with gather)
  src_x = M_inv[0, 0] * x + M_inv[0, 1] * y + M_inv[0, 2]
  src_y = M_inv[1, 0] * x + M_inv[1, 1] * y + M_inv[1, 2]
  src_w = M_inv[2, 0] * x + M_inv[2, 1] * y + M_inv[2, 2]
  src_x = src_x / src_w
  src_y = src_y / src_w
  x_nn_clipped = Tensor.round(src_x).clip(0, w_src - 1).cast('int')
  y_nn_clipped = Tensor.round(src_y).clip(0, h_src - 1).cast('int')
  idx = y_nn_clipped * (w_src + stride_pad) + x_nn_clipped
  return src_flat[idx]
 def frames_to_tensor(frames, model_w, model_h):
  H = (frames.shape[0] * 2) // 3
  W = frames.shape[1]
  in_img1 = Tensor.cat(frames[0:H:2, 0::2],
                       frames[1:H:2, 0::2],
                       frames[0:H:2, 1::2],
                       frames[1:H:2, 1::2],
                       frames[H:H+H//4].reshape((H//2, W//2)),
                       frames[H+H//4:H+H//2].reshape((H//2, W//2)), dim=0).reshape((6, H//2, W//2))
  return in_img1
 def make_frame_prepare(cam_w, cam_h, model_w, model_h):
  stride, y_height, uv_height, _ = get_nv12_info(cam_w, cam_h)
  uv_offset = stride * y_height
  stride_pad = stride - cam_w
  def frame_prepare_tinygrad(input_frame, M_inv):
    # UV_SCALE @ M_inv @ UV_SCALE_INV simplifies to elementwise scaling
    M_inv_uv = M_inv * Tensor([[1.0, 1.0, 0.5], [1.0, 1.0, 0.5], [2.0, 2.0, 1.0]])
    # deinterleave NV12 UV plane (UVUV... -> separate U, V)
    uv = input_frame[uv_offset:uv_offset + uv_height * stride].reshape(uv_height, stride)
    with Context(SPLIT_REDUCEOP=0):
      y = warp_perspective_tinygrad(input_frame[:cam_h*stride],
                                    M_inv, (model_w, model_h),
                                    (cam_h, cam_w), stride_pad).realize()
      u = warp_perspective_tinygrad(uv[:cam_h//2, :cam_w:2].flatten(),
                                    M_inv_uv, (model_w//2, model_h//2),
                                    (cam_h//2, cam_w//2), 0).realize()
      v = warp_perspective_tinygrad(uv[:cam_h//2, 1:cam_w:2].flatten(),
                                    M_inv_uv, (model_w//2, model_h//2),
                                    (cam_h//2, cam_w//2), 0).realize()
    yuv = y.cat(u).cat(v).reshape((model_h * 3 // 2, model_w))
    tensor = frames_to_tensor(yuv, model_w, model_h)
    return tensor
  return frame_prepare_tinygrad
 def make_update_img_input(frame_prepare, model_w, model_h):
  def update_img_input_tinygrad(tensor, frame, M_inv):
    M_inv = M_inv.to(Device.DEFAULT)
    new_img = frame_prepare(frame, M_inv)
    tensor.assign(tensor[6:].cat(new_img, dim=0).contiguous())
    return Tensor.cat(tensor[:6], tensor[-6:], dim=0).contiguous().reshape(1, 12, model_h//2, model_w//2)
  return update_img_input_tinygrad
 def make_update_both_imgs(frame_prepare, model_w, model_h):
  update_img = make_update_img_input(frame_prepare, model_w, model_h)
  def update_both_imgs_tinygrad(calib_img_buffer, new_img, M_inv,
                                calib_big_img_buffer, new_big_img, M_inv_big):
    calib_img_pair = update_img(calib_img_buffer, new_img, M_inv)
    calib_big_img_pair = update_img(calib_big_img_buffer, new_big_img, M_inv_big)
    return calib_img_pair, calib_big_img_pair
  return update_both_imgs_tinygrad
 def make_warp_dm(cam_w, cam_h, dm_w, dm_h):
  stride, y_height, _, _ = get_nv12_info(cam_w, cam_h)
  stride_pad = stride - cam_w
  def warp_dm(input_frame, M_inv):
    M_inv = M_inv.to(Device.DEFAULT)
    result = warp_perspective_tinygrad(input_frame[:cam_h*stride], M_inv, (dm_w, dm_h), (cam_h, cam_w), stride_pad).reshape(-1, dm_h * dm_w)
    return result
  return warp_dm
 def compile_modeld_warp(cam_w, cam_h):
  model_w, model_h = MEDMODEL_INPUT_SIZE
  _, _, _, yuv_size = get_nv12_info(cam_w, cam_h)
  print(f"Compiling modeld warp for {cam_w}x{cam_h}...")
  frame_prepare = make_frame_prepare(cam_w, cam_h, model_w, model_h)
  update_both_imgs = make_update_both_imgs(frame_prepare, model_w, model_h)
  update_img_jit = TinyJit(update_both_imgs, prune=True)
  full_buffer = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()
  big_full_buffer = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()
  new_frame_np = np.random.randint(0, 256, yuv_size, dtype=np.uint8)
  new_big_frame_np = np.random.randint(0, 256, yuv_size, dtype=np.uint8)
  for i in range(10):
    img_inputs = [full_buffer,
                  Tensor.from_blob(new_frame_np.ctypes.data, (yuv_size,), dtype='uint8').realize(),
                  Tensor(Tensor.randn(3, 3).mul(8).realize().numpy(), device='NPY')]
    big_img_inputs = [big_full_buffer,
                      Tensor.from_blob(new_big_frame_np.ctypes.data, (yuv_size,), dtype='uint8').realize(),
                      Tensor(Tensor.randn(3, 3).mul(8).realize().numpy(), device='NPY')]
    inputs = img_inputs + big_img_inputs
    Device.default.synchronize()
    st = time.perf_counter()
    _ = update_img_jit(*inputs)
    mt = time.perf_counter()
    Device.default.synchronize()
    et = time.perf_counter()
    print(f"  [{i+1}/10] enqueue {(mt-st)*1e3:6.2f} ms -- total {(et-st)*1e3:6.2f} ms")
  pkl_path = warp_pkl_path(cam_w, cam_h)
  with open(pkl_path, "wb") as f:
    pickle.dump(update_img_jit, f)
  print(f"  Saved to {pkl_path}")
  jit = pickle.load(open(pkl_path, "rb"))
  jit(*inputs)
 def compile_dm_warp(cam_w, cam_h):
  dm_w, dm_h = DM_INPUT_SIZE
  _, _, _, yuv_size = get_nv12_info(cam_w, cam_h)
  print(f"Compiling DM warp for {cam_w}x{cam_h}...")
  warp_dm = make_warp_dm(cam_w, cam_h, dm_w, dm_h)
  warp_dm_jit = TinyJit(warp_dm, prune=True)
  new_frame_np = np.random.randint(0, 256, yuv_size, dtype=np.uint8)
  for i in range(10):
    inputs = [Tensor.from_blob(new_frame_np.ctypes.data, (yuv_size,), dtype='uint8').realize(),
              Tensor(Tensor.randn(3, 3).mul(8).realize().numpy(), device='NPY')]
    Device.default.synchronize()
    st = time.perf_counter()
    warp_dm_jit(*inputs)
    mt = time.perf_counter()
    Device.default.synchronize()
    et = time.perf_counter()
    print(f"  [{i+1}/10] enqueue {(mt-st)*1e3:6.2f} ms -- total {(et-st)*1e3:6.2f} ms")
  pkl_path = dm_warp_pkl_path(cam_w, cam_h)
  with open(pkl_path, "wb") as f:
    pickle.dump(warp_dm_jit, f)
  print(f"  Saved to {pkl_path}")
 def run_and_save_pickle():
  for cam_w, cam_h in CAMERA_CONFIGS:
    compile_modeld_warp(cam_w, cam_h)
    compile_dm_warp(cam_w, cam_h)
 if __name__ == "__main__":
  run_and_save_pickle()
@@ -1,12 +1,8 @@
 #!/usr/bin/env python3
 import os
-from openpilot.selfdrive.modeld.tinygrad_helpers import MODELS_DIR, set_tinygrad_backend_from_compiled_flags
+from openpilot.selfdrive.modeld.helpers import MODELS_DIR, CompileConfig, set_tinygrad_backend_from_compiled_flags
 set_tinygrad_backend_from_compiled_flags()
 # FIXME-SP: remove once we bump tg
 from openpilot.system.hardware import TICI
 os.environ['DEV'] = 'QCOM' if TICI else 'CPU'
 from tinygrad.tensor import Tensor
 import time
 import pickle
@@ -32,7 +28,7 @@ class ModelState:
  inputs: dict[str, np.ndarray]
  output: np.ndarray
-  def __init__(self):
+  def __init__(self, cam_w: int, cam_h: int):
    with open(METADATA_PATH, 'rb') as f:
      model_metadata = pickle.load(f)
      self.input_shapes = model_metadata['input_shapes']
@@ -44,22 +40,18 @@ class ModelState:
    self.warp_inputs_np = {'transform': np.zeros((3,3), dtype=np.float32)}
    self.warp_inputs = {k: Tensor(v, device='NPY') for k,v in self.warp_inputs_np.items()}
-    self.frame_buf_params = None
+    self.frame_buf_params = get_nv12_info(cam_w, cam_h)
    self.tensor_inputs = {k: Tensor(v, device='NPY').realize() for k,v in self.numpy_inputs.items()}
    self._blob_cache : dict[int, Tensor] = {}
    self.image_warp = None
    self.model_run = pickle.loads(read_file_chunked(str(MODEL_PKL_PATH)))
    with open(CompileConfig(cam_w, cam_h, prefix='dm_', prepare_only=True).pkl_path, "rb") as f:
      self.image_warp = pickle.load(f)
  def run(self, buf: VisionBuf, calib: np.ndarray, transform: np.ndarray) -> tuple[np.ndarray, float]:
    self.numpy_inputs['calib'][0,:] = calib
    t1 = time.perf_counter()
    if self.image_warp is None:
      self.frame_buf_params = get_nv12_info(buf.width, buf.height)
      warp_path = MODELS_DIR / f'dm_warp_{buf.width}x{buf.height}_tinygrad.pkl'
      with open(warp_path, "rb") as f:
        self.image_warp = pickle.load(f)
    ptr = buf.data.ctypes.data
    # There is a ringbuffer of imgs, just cache tensors pointing to all of them
    if ptr not in self._blob_cache:
@@ -113,9 +105,6 @@ def get_driverstate_packet(model_output, frame_id: int, location_ts: int, exec_t
 def main():
  config_realtime_process(7, 5)
  model = ModelState()
  cloudlog.warning("models loaded, dmonitoringmodeld starting")
  cloudlog.warning("connecting to driver stream")
  vipc_client = VisionIpcClient("camerad", VisionStreamType.VISION_STREAM_DRIVER, True)
  while not vipc_client.connect(False):
@@ -123,6 +112,9 @@ def main():
  assert vipc_client.is_connected()
  cloudlog.warning(f"connected with buffer size: {vipc_client.buffer_len}")
  model = ModelState(vipc_client.width, vipc_client.height)
  cloudlog.warning("models loaded, dmonitoringmodeld starting")
  sm = SubMaster(["liveCalibration"])
  pm = PubMaster(["driverStateV2"])
@@ -7,6 +7,10 @@ from typing import Any
 from tinygrad.nn.onnx import OnnxPBParser
 def metadata_path_for(onnx_path) -> pathlib.Path:
  p = pathlib.Path(onnx_path)
  return p.parent / (p.stem + '_metadata.pkl')
 class MetadataOnnxPBParser(OnnxPBParser):
  def _parse_ModelProto(self) -> dict:
@@ -48,7 +52,7 @@ if __name__ == "__main__":
    'output_shapes': dict(get_name_and_shape(x) for x in model["graph"]["output"]),
  }
-  metadata_path = model_path.parent / (model_path.stem + '_metadata.pkl')
+  metadata_path = metadata_path_for(model_path)
  with open(metadata_path, 'wb') as f:
    pickle.dump(metadata, f)
@@ -0,0 +1,31 @@
 import json
 import os
 from dataclasses import dataclass
 from pathlib import Path
 from openpilot.system.camerad.cameras.nv12_info import get_nv12_info
 MODELS_DIR = Path(__file__).resolve().parent / 'models'
 COMPILED_FLAGS_PATH = MODELS_DIR / 'tg_compiled_flags.json'
 def set_tinygrad_backend_from_compiled_flags() -> None:
  if os.path.isfile(COMPILED_FLAGS_PATH):
    with open(COMPILED_FLAGS_PATH) as f:
      os.environ['DEV'] = str(json.load(f)['DEV'])
@dataclass
 class CompileConfig:
  cam_w: int
  cam_h: int
  prepare_only: bool
  prefix: str
  @property
  def pkl_path(self):
    return str(MODELS_DIR / f'{self.prefix}{"warp_" if self.prepare_only else ""}{self.cam_w}x{self.cam_h}_tinygrad.pkl')
  @property
  def nv12(self):
    return (self.cam_w, self.cam_h, *get_nv12_info(self.cam_w, self.cam_h))
@@ -1,12 +1,8 @@
 #!/usr/bin/env python3
 import os
-from openpilot.selfdrive.modeld.tinygrad_helpers import MODELS_DIR, set_tinygrad_backend_from_compiled_flags
+from openpilot.selfdrive.modeld.helpers import MODELS_DIR, CompileConfig, set_tinygrad_backend_from_compiled_flags
 set_tinygrad_backend_from_compiled_flags()
 # FIXME-SP: remove once we bump tg
 from openpilot.system.hardware import TICI
 os.environ['DEV'] = 'QCOM' if TICI else 'CPU'
 USBGPU = "USBGPU" in os.environ
 if USBGPU:
  os.environ['DEV'] = 'AMD'
@@ -30,6 +26,7 @@ from openpilot.common.transformations.model import get_warp_matrix
 from openpilot.selfdrive.controls.lib.desire_helper import DesireHelper
 from openpilot.selfdrive.controls.lib.drive_helpers import get_accel_from_plan, smooth_value, get_curvature_from_plan
 from openpilot.selfdrive.modeld.parse_model_outputs import Parser
 from openpilot.selfdrive.modeld.compile_modeld import make_input_queues
 from openpilot.selfdrive.modeld.fill_model_msg import fill_model_msg, fill_pose_msg, PublishState
 from openpilot.common.file_chunker import read_file_chunked
 from openpilot.selfdrive.modeld.constants import ModelConstants, Plan
@@ -41,17 +38,13 @@ from openpilot.sunnypilot.modeld_v2.modeld_base import ModelStateBase
 PROCESS_NAME = "selfdrive.modeld.modeld"
 SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')
 VISION_PKL_PATH = MODELS_DIR / 'driving_vision_tinygrad.pkl'
 VISION_METADATA_PATH = MODELS_DIR / 'driving_vision_metadata.pkl'
 POLICY_PKL_PATH = MODELS_DIR / 'driving_policy_tinygrad.pkl'
 POLICY_METADATA_PATH = MODELS_DIR / 'driving_policy_metadata.pkl'
 LAT_SMOOTH_SECONDS = 0.0
 LONG_SMOOTH_SECONDS = 0.3
 MIN_LAT_CONTROL_SPEED = 0.3
 IMG_QUEUE_SHAPE = (6*(ModelConstants.MODEL_RUN_FREQ//ModelConstants.MODEL_CONTEXT_FREQ + 1), 128, 256)
 assert IMG_QUEUE_SHAPE[0] == 30
 def get_action_from_model(model_output: dict[str, np.ndarray], prev_action: log.ModelDataV2.Action,
@@ -86,108 +79,39 @@ class FrameMeta:
    if vipc is not None:
      self.frame_id, self.timestamp_sof, self.timestamp_eof = vipc.frame_id, vipc.timestamp_sof, vipc.timestamp_eof
 class InputQueues:
  def __init__ (self, model_fps, env_fps, n_frames_input):
    assert env_fps % model_fps == 0
    assert env_fps >= model_fps
    self.model_fps = model_fps
    self.env_fps = env_fps
    self.n_frames_input = n_frames_input
    self.dtypes = {}
    self.shapes = {}
    self.q = {}
  def update_dtypes_and_shapes(self, input_dtypes, input_shapes) -> None:
    self.dtypes.update(input_dtypes)
    if self.env_fps == self.model_fps:
      self.shapes.update(input_shapes)
    else:
      for k in input_shapes:
        shape = list(input_shapes[k])
        if 'img' in k:
          n_channels = shape[1] // self.n_frames_input
          shape[1] = (self.env_fps // self.model_fps + (self.n_frames_input - 1)) * n_channels
        else:
          shape[1] = (self.env_fps // self.model_fps) * shape[1]
        self.shapes[k] = tuple(shape)
  def reset(self) -> None:
    self.q = {k: np.zeros(self.shapes[k], dtype=self.dtypes[k]) for k in self.dtypes.keys()}
  def enqueue(self, inputs:dict[str, np.ndarray]) -> None:
    for k in inputs.keys():
      if inputs[k].dtype != self.dtypes[k]:
        raise ValueError(f'supplied input <{k}({inputs[k].dtype})> has wrong dtype, expected {self.dtypes[k]}')
      input_shape = list(self.shapes[k])
      input_shape[1] = -1
      single_input = inputs[k].reshape(tuple(input_shape))
      sz = single_input.shape[1]
      self.q[k][:,:-sz] = self.q[k][:,sz:]
      self.q[k][:,-sz:] = single_input
  def get(self, *names) -> dict[str, np.ndarray]:
    if self.env_fps == self.model_fps:
      return {k: self.q[k] for k in names}
    else:
      out = {}
      for k in names:
        shape = self.shapes[k]
        if 'img' in k:
          n_channels = shape[1] // (self.env_fps // self.model_fps + (self.n_frames_input - 1))
          out[k] = np.concatenate([self.q[k][:, s:s+n_channels] for s in np.linspace(0, shape[1] - n_channels, self.n_frames_input, dtype=int)], axis=1)
        elif 'pulse' in k:
          # any pulse within interval counts
          out[k] = self.q[k].reshape((shape[0], shape[1] * self.model_fps // self.env_fps, self.env_fps // self.model_fps, -1)).max(axis=2)
        else:
          idxs = np.arange(-1, -shape[1], -self.env_fps // self.model_fps)[::-1]
          out[k] = self.q[k][:, idxs]
      return out
 class ModelState(ModelStateBase):
  inputs: dict[str, np.ndarray]
  output: np.ndarray
  prev_desire: np.ndarray  # for tracking the rising edge of the pulse
-  def __init__(self):
+  def __init__(self, cam_w: int, cam_h: int):
    ModelStateBase.__init__(self)
    self.LAT_SMOOTH_SECONDS = LAT_SMOOTH_SECONDS
    with open(VISION_METADATA_PATH, 'rb') as f:
      vision_metadata = pickle.load(f)
      self.vision_input_shapes =  vision_metadata['input_shapes']
      self.vision_input_names = list(self.vision_input_shapes.keys())
      self.vision_output_slices = vision_metadata['output_slices']
      vision_output_size = vision_metadata['output_shapes']['outputs'][1]
    with open(POLICY_METADATA_PATH, 'rb') as f:
      policy_metadata = pickle.load(f)
      self.policy_input_shapes =  policy_metadata['input_shapes']
      self.policy_output_slices = policy_metadata['output_slices']
      policy_output_size = policy_metadata['output_shapes']['outputs'][1]
    self.prev_desire = np.zeros(ModelConstants.DESIRE_LEN, dtype=np.float32)
-    # policy inputs
+    self.frame_skip = ModelConstants.MODEL_RUN_FREQ // ModelConstants.MODEL_CONTEXT_FREQ
-    self.numpy_inputs = {k: np.zeros(self.policy_input_shapes[k], dtype=np.float32) for k in self.policy_input_shapes}
+    self.input_queues, self.npy = make_input_queues(self.vision_input_shapes, self.policy_input_shapes, self.frame_skip)
    self.full_input_queues = InputQueues(ModelConstants.MODEL_CONTEXT_FREQ, ModelConstants.MODEL_RUN_FREQ, ModelConstants.N_FRAMES)
    for k in ['desire_pulse', 'features_buffer']:
      self.full_input_queues.update_dtypes_and_shapes({k: self.numpy_inputs[k].dtype}, {k: self.numpy_inputs[k].shape})
    self.full_input_queues.reset()
    self.img_queues = {'img': Tensor.zeros(IMG_QUEUE_SHAPE, dtype='uint8').contiguous().realize(),
                       'big_img': Tensor.zeros(IMG_QUEUE_SHAPE, dtype='uint8').contiguous().realize()}
    self.full_frames : dict[str, Tensor] = {}
    self._blob_cache : dict[int, Tensor] = {}
    self.transforms_np = {k: np.zeros((3,3), dtype=np.float32) for k in self.img_queues}
    self.transforms = {k: Tensor(v, device='NPY').realize() for k, v in self.transforms_np.items()}
    self.vision_output = np.zeros(vision_output_size, dtype=np.float32)
    self.policy_inputs = {k: Tensor(v, device='NPY').realize() for k,v in self.numpy_inputs.items()}
    self.policy_output = np.zeros(policy_output_size, dtype=np.float32)
    self.parser = Parser()
-    self.frame_buf_params : dict[str, tuple[int, int, int, int]] = {}
+    self.frame_buf_params = {k: get_nv12_info(cam_w, cam_h) for k in ('img', 'big_img')}
-    self.update_imgs = None
+    self.run_policy = pickle.loads(read_file_chunked(CompileConfig(cam_w, cam_h, prefix='driving_', prepare_only=False).pkl_path))
-    self.vision_run = pickle.loads(read_file_chunked(str(VISION_PKL_PATH)))
+    self.warp_enqueue = pickle.loads(read_file_chunked(CompileConfig(cam_w, cam_h, prefix='driving_', prepare_only=True).pkl_path))
-    self.policy_run = pickle.loads(read_file_chunked(str(POLICY_PKL_PATH)))
+    self.warp_enqueue(
      **self.input_queues,
      frame=Tensor.zeros(self.frame_buf_params['img'][3], dtype='uint8').contiguous().realize(),
      big_frame=Tensor.zeros(self.frame_buf_params['big_img'][3], dtype='uint8').contiguous().realize())
  def slice_outputs(self, model_outputs: np.ndarray, output_slices: dict[str, slice]) -> dict[str, np.ndarray]:
    parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in output_slices.items()}
@@ -195,18 +119,6 @@ class ModelState(ModelStateBase):
  def run(self, bufs: dict[str, VisionBuf], transforms: dict[str, np.ndarray],
                inputs: dict[str, np.ndarray], prepare_only: bool) -> dict[str, np.ndarray] | None:
    # Model decides when action is completed, so desire input is just a pulse triggered on rising edge
    inputs['desire_pulse'][0] = 0
    new_desire = np.where(inputs['desire_pulse'] - self.prev_desire > .99, inputs['desire_pulse'], 0)
    self.prev_desire[:] = inputs['desire_pulse']
    if self.update_imgs is None:
      for key in bufs.keys():
        w, h = bufs[key].width, bufs[key].height
        self.frame_buf_params[key] = get_nv12_info(w, h)
      warp_path = MODELS_DIR / f'warp_{w}x{h}_tinygrad.pkl'
      with open(warp_path, "rb") as f:
        self.update_imgs = pickle.load(f)
    for key in bufs.keys():
      ptr = bufs[key].data.ctypes.data
      yuv_size = self.frame_buf_params[key][3]
@@ -215,30 +127,31 @@ class ModelState(ModelStateBase):
      if cache_key not in self._blob_cache:
        self._blob_cache[cache_key] = Tensor.from_blob(ptr, (yuv_size,), dtype='uint8')
      self.full_frames[key] = self._blob_cache[cache_key]
    for key in bufs.keys():
      self.transforms_np[key][:,:] = transforms[key][:,:]
-    out = self.update_imgs(self.img_queues['img'], self.full_frames['img'], self.transforms['img'],
+    # Model decides when action is completed, so desire input is just a pulse triggered on rising edge
-                           self.img_queues['big_img'], self.full_frames['big_img'], self.transforms['big_img'])
+    inputs['desire_pulse'][0] = 0
-    vision_inputs = {'img': out[0], 'big_img': out[1]}
+    self.npy['desire'][:] = np.where(inputs['desire_pulse'] - self.prev_desire > .99, inputs['desire_pulse'], 0)
    self.prev_desire[:] = inputs['desire_pulse']
    self.npy['traffic_convention'][:] = inputs['traffic_convention']
    self.npy['tfm'][:,:] = transforms['img'][:,:]
    self.npy['big_tfm'][:,:] = transforms['big_img'][:,:]
    if prepare_only:
      self.warp_enqueue(**self.input_queues, frame=self.full_frames['img'], big_frame=self.full_frames['big_img'])
      return None
-    self.vision_output = self.vision_run(**vision_inputs).contiguous().realize().uop.base.buffer.numpy().flatten()
+    vision_output, policy_output = self.run_policy(
-    vision_outputs_dict = self.parser.parse_vision_outputs(self.slice_outputs(self.vision_output, self.vision_output_slices))
+      **self.input_queues, frame=self.full_frames['img'], big_frame=self.full_frames['big_img']
    )
-    self.full_input_queues.enqueue({'features_buffer': vision_outputs_dict['hidden_state'], 'desire_pulse': new_desire})
+    vision_output = vision_output.numpy().flatten()
-    for k in ['desire_pulse', 'features_buffer']:
+    policy_output = policy_output.numpy().flatten()
-      self.numpy_inputs[k][:] = self.full_input_queues.get(k)[k]
+    vision_outputs_dict = self.parser.parse_vision_outputs(self.slice_outputs(vision_output, self.vision_output_slices))
-    self.numpy_inputs['traffic_convention'][:] = inputs['traffic_convention']
+    policy_outputs_dict = self.parser.parse_policy_outputs(self.slice_outputs(policy_output, self.policy_output_slices))
    self.policy_output = self.policy_run(**self.policy_inputs).contiguous().realize().uop.base.buffer.numpy().flatten()
    policy_outputs_dict = self.parser.parse_policy_outputs(self.slice_outputs(self.policy_output, self.policy_output_slices))
    combined_outputs_dict = {**vision_outputs_dict, **policy_outputs_dict}
    if SEND_RAW_PRED:
      combined_outputs_dict['raw_pred'] = np.concatenate([self.vision_output.copy(), self.policy_output.copy()])
    if SEND_RAW_PRED:
      combined_outputs_dict['raw_pred'] = np.concatenate([vision_output.copy(), policy_output.copy()])
    return combined_outputs_dict
@@ -250,11 +163,6 @@ def main(demo=False):
    # also need to move the aux USB interrupts for good timings
    config_realtime_process(7, 54)
  st = time.monotonic()
  cloudlog.warning("loading model")
  model = ModelState()
  cloudlog.warning(f"models loaded in {time.monotonic() - st:.1f}s, modeld starting")
  # visionipc clients
  while True:
    available_streams = VisionIpcClient.available_streams("camerad", block=False)
@@ -278,6 +186,11 @@ def main(demo=False):
  if use_extra_client:
    cloudlog.warning(f"connected extra cam with buffer size: {vipc_client_extra.buffer_len} ({vipc_client_extra.width} x {vipc_client_extra.height})")
  st = time.monotonic()
  cloudlog.warning("loading model")
  model = ModelState(vipc_client_main.width, vipc_client_main.height)
  cloudlog.warning(f"models loaded in {time.monotonic() - st:.1f}s, modeld starting")
  # messaging
  pm = PubMaster(["modelV2", "drivingModelData", "cameraOdometry", "modelDataV2SP"])
  sm = SubMaster(["deviceState", "carState", "roadCameraState", "liveCalibration", "driverMonitoringState", "carControl", "liveDelay"])
@@ -1,12 +0,0 @@
 import json
 import os
 from pathlib import Path
 MODELS_DIR = Path(__file__).parent / 'models'
 COMPILED_FLAGS_PATH = MODELS_DIR / 'tg_compiled_flags.json'
 def set_tinygrad_backend_from_compiled_flags() -> None:
  if os.path.isfile(COMPILED_FLAGS_PATH):
    with open(COMPILED_FLAGS_PATH) as f:
      os.environ['DEV'] = str(json.load(f)['DEV'])
@@ -1,5 +1,6 @@
 import os
 import glob
 from tinygrad import Device
 Import('env', 'arch')
 lenv = env.Clone()
@@ -21,10 +22,19 @@ if PC:
  if outputs:
    lenv.Command(outputs, inputs, cmd)
-tg_flags = {
+available = set(Device.get_available_devices())
-  'larch64': 'DEV=QCOM FLOAT16=1 NOLOCALS=1 JIT_BATCH_SIZE=0',
+if 'CUDA' in available:
-  'Darwin': f'DEV=CPU THREADS=0 HOME={os.path.expanduser("~")}',
+  tg_backend = 'CUDA'
-}.get(arch, 'DEV=CPU CPU_LLVM=1 THREADS=0')
+  tg_flags = f'DEV={tg_backend}'
 elif 'QCOM' in available:
  tg_backend = 'QCOM'
  tg_flags = f'DEV={tg_backend} FLOAT16=1 NOLOCALS=1 JIT_BATCH_SIZE=0'
 else:
  tg_backend = 'CPU' if arch == 'Darwin' else 'CPU:LLVM'
  # THREADS=0 is need to prevent bug: https://github.com/tinygrad/tinygrad/issues/14689
  tg_flags = f'DEV={tg_backend} THREADS=0'
 mac_brew_string = f'HOME={os.path.expanduser("~")}' if arch == 'Darwin' else ''
 image_flag = {
  'larch64': 'IMAGE=2',
@@ -38,7 +48,7 @@ def tg_compile(flags, model_name):
  return lenv.Command(
    out,
    [fn + ".onnx"] + tinygrad_files,
-    f'{pythonpath_string} {flags} {image_flag} python3 {Dir("#tinygrad_repo").abspath}/examples/openpilot/compile3.py {fn}.onnx {out}'
+    f'{pythonpath_string} {tg_flags} {mac_brew_string} {image_flag} python3 {Dir("#tinygrad_repo").abspath}/examples/openpilot/compile3.py {fn}.onnx {out}'
  )
 # Compile models
@@ -46,9 +56,9 @@ for model_name in ['supercombo', 'driving_vision', 'driving_off_policy', 'drivin
  if File(f"models/{model_name}.onnx").exists():
    tg_compile(tg_flags, model_name)
-script_files = [File("warp.py"), File(Dir("#selfdrive/modeld").File("compile_warp.py").abspath)]
+script_files = [File("warp.py")]
 pythonpath_string = 'PYTHONPATH="${PYTHONPATH}:' + env.Dir("#tinygrad_repo").abspath + ':' + env.Dir("#").abspath + '"'
-compile_warp_cmd = f'{pythonpath_string} {tg_flags} python3 -m sunnypilot.modeld_v2.warp'
+compile_warp_cmd = f'{pythonpath_string} {tg_flags} {mac_brew_string} {image_flag} python3 -m sunnypilot.modeld_v2.warp'
 from openpilot.common.transformations.camera import _ar_ox_fisheye, _os_fisheye
 warp_targets = []
@@ -129,8 +129,7 @@ class ModelState(ModelStateBase):
        self.numpy_inputs[key][:] = inputs[key]
    imgs_tensors = self.warp.process(bufs, transforms)
-    for name, tensor in imgs_tensors.items():
+    self.model_runner.update_vision_inputs(imgs_tensors)
      self.model_runner.inputs[name] = tensor
    self.model_runner.prepare_inputs(self.numpy_inputs)
    if prepare_only:
@@ -2,8 +2,11 @@ import os
 os.environ['DEV'] = 'CPU'
 import pytest
 import numpy as np
-from openpilot.selfdrive.modeld.compile_warp import get_nv12_info, CAMERA_CONFIGS
+from openpilot.sunnypilot.modeld_v2.warp import CAMERA_CONFIGS
-from openpilot.sunnypilot.modeld_v2.warp import Warp, MODEL_W, MODEL_H
+from openpilot.system.camerad.cameras.nv12_info import get_nv12_info
 from openpilot.sunnypilot.modeld_v2.warp import Warp
 from openpilot.common.transformations.model import MEDMODEL_INPUT_SIZE
 MODEL_W, MODEL_H = MEDMODEL_INPUT_SIZE
 VISION_NAME_PAIRS = [  # needed to account for supercombos input_imgs
  ('img', 'big_img'),
@@ -6,29 +6,128 @@ from tinygrad.tensor import Tensor
 from tinygrad.engine.jit import TinyJit
 from tinygrad.device import Device
 from typing import NamedTuple
 # https://github.com/tinygrad/tinygrad/issues/15682
 from tinygrad.uop.ops import UOp, Ops
 _orig = UOp.__reduce__
 UOp.__reduce__ = lambda self: (UOp.unique, ()) if self.op is Ops.UNIQUE else _orig(self)
 from tinygrad.helpers import Context
 from openpilot.system.camerad.cameras.nv12_info import get_nv12_info
-from openpilot.selfdrive.modeld.compile_warp import (
+from openpilot.common.transformations.camera import _ar_ox_fisheye, _os_fisheye
-  CAMERA_CONFIGS, MEDMODEL_INPUT_SIZE, make_frame_prepare, make_update_both_imgs,
+
-  warp_pkl_path,
+class NV12Frame(NamedTuple):
-)
+  cam_w: int
  cam_h: int
  stride: int
  y_height: int
  uv_height: int
  size: int
 UV_SCALE_MATRIX = np.array([[0.5, 0, 0], [0, 0.5, 0], [0, 0, 1]], dtype=np.float32)
 UV_SCALE_MATRIX_INV = np.linalg.inv(UV_SCALE_MATRIX)
 CAMERA_CONFIGS = [
  (_ar_ox_fisheye.width, _ar_ox_fisheye.height),  # tici: 1928x1208
  (_os_fisheye.width, _os_fisheye.height),        # mici: 1344x760
 ]
 from openpilot.common.transformations.model import MEDMODEL_INPUT_SIZE
 MODELS_DIR = Path(__file__).parent / 'models'
-MODEL_W, MODEL_H = MEDMODEL_INPUT_SIZE
+
 UPSTREAM_BUFFER_LENGTH = 5
 def warp_pkl_path(cam_w, cam_h):
  return MODELS_DIR / f'warp_{cam_w}x{cam_h}_tinygrad.pkl'
 def warp_perspective_tinygrad(src_flat, M_inv, dst_shape, src_shape, stride_pad):
  w_dst, h_dst = dst_shape
  h_src, w_src = src_shape
  x = Tensor.arange(w_dst).reshape(1, w_dst).expand(h_dst, w_dst).reshape(-1)
  y = Tensor.arange(h_dst).reshape(h_dst, 1).expand(h_dst, w_dst).reshape(-1)
  # inline 3x3 matmul as elementwise to avoid reduce op (enables fusion with gather)
  src_x = M_inv[0, 0] * x + M_inv[0, 1] * y + M_inv[0, 2]
  src_y = M_inv[1, 0] * x + M_inv[1, 1] * y + M_inv[1, 2]
  src_w = M_inv[2, 0] * x + M_inv[2, 1] * y + M_inv[2, 2]
  src_x = src_x / src_w
  src_y = src_y / src_w
  x_nn_clipped = Tensor.round(src_x).clip(0, w_src - 1).cast('int')
  y_nn_clipped = Tensor.round(src_y).clip(0, h_src - 1).cast('int')
  idx = y_nn_clipped * (w_src + stride_pad) + x_nn_clipped
  return src_flat[idx]
 def frames_to_tensor(frames, model_w, model_h):
  H = (frames.shape[0] * 2) // 3
  W = frames.shape[1]
  in_img1 = Tensor.cat(frames[0:H:2, 0::2],
                       frames[1:H:2, 0::2],
                       frames[0:H:2, 1::2],
                       frames[1:H:2, 1::2],
                       frames[H:H+H//4].reshape((H//2, W//2)),
                       frames[H+H//4:H+H//2].reshape((H//2, W//2)), dim=0).reshape((6, H//2, W//2))
  return in_img1
 def make_frame_prepare(cam_w, cam_h, model_w, model_h):
  stride, y_height, uv_height, _ = get_nv12_info(cam_w, cam_h)
  uv_offset = stride * y_height
  stride_pad = stride - cam_w
  def frame_prepare_tinygrad(input_frame, M_inv):
    # UV_SCALE @ M_inv @ UV_SCALE_INV simplifies to elementwise scaling
    M_inv_uv = M_inv * Tensor([[1.0, 1.0, 0.5], [1.0, 1.0, 0.5], [2.0, 2.0, 1.0]])
    # deinterleave NV12 UV plane (UVUV... -> separate U, V)
    uv = input_frame[uv_offset:uv_offset + uv_height * stride].reshape(uv_height, stride)
    with Context(SPLIT_REDUCEOP=0):
      y = warp_perspective_tinygrad(input_frame[:cam_h*stride],
                                    M_inv, (model_w, model_h),
                                    (cam_h, cam_w), stride_pad).realize()
      u = warp_perspective_tinygrad(uv[:cam_h//2, :cam_w:2].flatten(),
                                    M_inv_uv, (model_w//2, model_h//2),
                                    (cam_h//2, cam_w//2), 0).realize()
      v = warp_perspective_tinygrad(uv[:cam_h//2, 1:cam_w:2].flatten(),
                                    M_inv_uv, (model_w//2, model_h//2),
                                    (cam_h//2, cam_w//2), 0).realize()
    yuv = y.cat(u).cat(v).reshape((model_h * 3 // 2, model_w))
    tensor = frames_to_tensor(yuv, model_w, model_h)
    return tensor
  return frame_prepare_tinygrad
 def make_update_img_input(frame_prepare, model_w, model_h):
  def update_img_input_tinygrad(tensor, frame, M_inv):
    M_inv = M_inv.to(Device.DEFAULT)
    new_img = frame_prepare(frame, M_inv)
    tensor.assign(tensor[6:].cat(new_img, dim=0).contiguous())
    return tensor, Tensor.cat(tensor[:6], tensor[-6:], dim=0).contiguous().reshape(1, 12, model_h//2, model_w//2)
  return update_img_input_tinygrad
 def make_update_both_imgs(frame_prepare, model_w, model_h):
  update_img = make_update_img_input(frame_prepare, model_w, model_h)
  def update_both_imgs_tinygrad(calib_img_buffer, new_img, M_inv,
                                calib_big_img_buffer, new_big_img, M_inv_big):
    r1, r2 = update_img(calib_img_buffer, new_img, M_inv)
    w1, w2 = update_img(calib_big_img_buffer, new_big_img, M_inv_big)
    return r1, r2, w1, w2
  return update_both_imgs_tinygrad
 def v2_warp_pkl_path(cam_w, cam_h, buffer_length):
  return MODELS_DIR / f'warp_{cam_w}x{cam_h}_b{buffer_length}_tinygrad.pkl'
-def compile_v2_warp(cam_w, cam_h, buffer_length):
+def compile_v2_warp(cam_w, cam_h, buffer_length, model_w=MEDMODEL_INPUT_SIZE[0], model_h=MEDMODEL_INPUT_SIZE[1], pkl_path=None):
  _, _, _, yuv_size = get_nv12_info(cam_w, cam_h)
-  img_buffer_shape = (buffer_length * 6, MODEL_H // 2, MODEL_W // 2)
+  img_buffer_shape = (buffer_length * 6, model_h // 2, model_w // 2)
  print(f"Compiling v2 warp for {cam_w}x{cam_h} buffer_length={buffer_length}...")
-  frame_prepare = make_frame_prepare(cam_w, cam_h, MODEL_W, MODEL_H)
+  frame_prepare = make_frame_prepare(cam_w, cam_h, model_w, model_h)
-  update_both_imgs = make_update_both_imgs(frame_prepare, MODEL_W, MODEL_H)
+  update_both_imgs = make_update_both_imgs(frame_prepare, model_w, model_h)
  update_img_jit = TinyJit(update_both_imgs, prune=True)
  full_buffer = Tensor.zeros(img_buffer_shape, dtype='uint8').contiguous().realize()
@@ -46,25 +145,25 @@ def compile_v2_warp(cam_w, cam_h, buffer_length):
    Device.default.synchronize()
    st = time.perf_counter()
-    _ = update_img_jit(*inputs)
+    update_img_jit(*inputs)
    mt = time.perf_counter()
    Device.default.synchronize()
    et = time.perf_counter()
    print(f"  [{i+1}/10] enqueue {(mt-st)*1e3:6.2f} ms -- total {(et-st)*1e3:6.2f} ms")
-  pkl_path = v2_warp_pkl_path(cam_w, cam_h, buffer_length)
+  if pkl_path is None:
    pkl_path = v2_warp_pkl_path(cam_w, cam_h, buffer_length)
  with open(pkl_path, "wb") as f:
    pickle.dump(update_img_jit, f)
  print(f"  Saved to {pkl_path}")
  jit = pickle.load(open(pkl_path, "rb"))
  jit(*inputs)
 class Warp:
-  def __init__(self, buffer_length=2):
+  def __init__(self, buffer_length=2, model_w=MEDMODEL_INPUT_SIZE[0], model_h=MEDMODEL_INPUT_SIZE[1]):
    self.buffer_length = buffer_length
-    self.img_buffer_shape = (buffer_length * 6, MODEL_H // 2, MODEL_W // 2)
+    self.model_w = model_w
    self.model_h = model_h
    self.img_buffer_shape = (buffer_length * 6, model_h // 2, model_w // 2)
    self.jit_cache = {}
    self.full_buffers = {k: Tensor.zeros(self.img_buffer_shape, dtype='uint8').contiguous().realize() for k in ['img', 'big_img']}
@@ -92,8 +191,8 @@ class Warp:
          with open(upstream_pkl, 'rb') as f:
            self.jit_cache[key] = pickle.load(f)
      if key not in self.jit_cache:
-        frame_prepare = make_frame_prepare(cam_w, cam_h, MODEL_W, MODEL_H)
+        frame_prepare = make_frame_prepare(cam_w, cam_h, self.model_w, self.model_h)
-        update_both_imgs = make_update_both_imgs(frame_prepare, MODEL_W, MODEL_H)
+        update_both_imgs = make_update_both_imgs(frame_prepare, self.model_w, self.model_h)
        self.jit_cache[key] = TinyJit(update_both_imgs, prune=True)
    if key not in self._nv12_cache:
@@ -107,7 +206,7 @@ class Warp:
    if wide_ptr not in self._blob_cache:
      self._blob_cache[wide_ptr] = Tensor.from_blob(wide_ptr, (yuv_size,), dtype='uint8')
    road_blob = self._blob_cache[road_ptr]
-    wide_blob = self._blob_cache[wide_ptr] if wide_ptr != road_ptr else Tensor.from_blob(wide_ptr, (yuv_size,), dtype='uint8')
+    wide_blob = self._blob_cache[wide_ptr]
    np.copyto(self.transforms_np['img'], transforms[road].reshape(3, 3))
    np.copyto(self.transforms_np['big_img'], transforms[wide].reshape(3, 3))
@@ -116,13 +215,11 @@ class Warp:
      self.full_buffers['img'], road_blob, self.transforms['img'],
      self.full_buffers['big_img'], wide_blob, self.transforms['big_img'],
    )
-    out_road = res[0].realize()
+    return {road: res[1].realize(), wide: res[3].realize()}
    out_wide = res[1].realize()
    return {road: out_road, wide: out_wide}
 if __name__ == "__main__":
  for cam_w, cam_h in CAMERA_CONFIGS:
    compile_v2_warp(cam_w, cam_h, 5, pkl_path=warp_pkl_path(cam_w, cam_h))
    for bl in [2, 5]:
      compile_v2_warp(cam_w, cam_h, bl)
@@ -116,7 +116,7 @@ class ModelCache:
 class ModelFetcher:
  """Handles fetching and caching of model data from remote source"""
-  MODEL_URL = "https://raw.githubusercontent.com/sunnypilot/sunnypilot-models/refs/heads/gh-pages/docs/driving_models_v16.json"
+  MODEL_URL = "https://raw.githubusercontent.com/sunnypilot/sunnypilot-models/refs/heads/gh-pages/docs/driving_models_v18.json"
  def __init__(self, params: Params):
    self.params = params
@@ -132,6 +132,11 @@ class ModelRunner(ModularRunner):
      return list(self._model_data.input_shapes.keys())
    raise ValueError("Model data is not available. Ensure the model is loaded correctly.")
  def update_vision_inputs(self, vision_inputs: dict) -> None:
    """Updates the vision inputs in the runner."""
    for name, tensor in vision_inputs.items():
      self.inputs[name] = tensor
  @abstractmethod
  def prepare_inputs(self, numpy_inputs: NumpyDict) -> dict:
    """
@@ -46,14 +46,13 @@ class TinygradRunner(ModelRunner, SupercomboTinygrad, PolicyTinygrad, VisionTiny
        assert "/dev/kgsl-3d0" not in str(e), "Model was built on C3 or C3X, but is being loaded on PC"
        raise
    # Map input names to their required dtype and device from the loaded model
    self.input_to_dtype = {}
    self.input_to_device = {}
    for idx, name in enumerate(self.model_run.captured.expected_names):
      info = self.model_run.captured.expected_input_info[idx]
-      self.input_to_dtype[name] = info[2]  # dtype
+      self.input_to_dtype[name] = info[2]
-      self.input_to_device[name] = info[3]  # device
+      self.input_to_device[name] = info[3]
-    self._policy_cached = False
+      self.inputs[name] = Tensor.zeros(*self.input_shapes[name], dtype=info[2], device=info[3]).realize()
  @property
  def vision_input_names(self) -> list[str]:
@@ -62,22 +61,23 @@ class TinygradRunner(ModelRunner, SupercomboTinygrad, PolicyTinygrad, VisionTiny
  def prepare_policy_inputs(self, numpy_inputs: NumpyDict):
-    if not self._policy_cached:
+    for key, value in numpy_inputs.items():
-      for key, value in numpy_inputs.items():
+      if key in self.inputs:
-        self.inputs[key] = Tensor(value, device='NPY').realize()
+        self.inputs[key].assign(Tensor(value, device=self.inputs[key].device))
      self._policy_cached = True
  def prepare_inputs(self, numpy_inputs: NumpyDict) -> dict:
    """Prepares all vision and policy inputs for the model."""
    self.prepare_policy_inputs(numpy_inputs)
    for key in self.vision_input_names:
      if key in self.inputs:
        self.inputs[key] = self.inputs[key].cast(self.input_to_dtype[key])
    return self.inputs
  def update_vision_inputs(self, vision_inputs: dict[str, Tensor]):
    for name, tensor in vision_inputs.items():
      if name in self.inputs:
        self.inputs[name].assign(tensor)
  def _run_model(self) -> NumpyDict:
    """Runs the Tinygrad model inference and parses the outputs."""
-    outputs = self.model_run(**self.inputs).contiguous().realize().uop.base.buffer.numpy().flatten()
+    outputs = self.model_run(**self.inputs).numpy().flatten()
    return self._parse_outputs(outputs)
  def _parse_outputs(self, model_outputs: np.ndarray) -> NumpyDict:
Author	SHA1	Message	Date
Jason Wen	6654e9cdf9	Merge branch 'master' into tinygrad-sync-4/25	2026-05-17 21:20:04 -04:00
discountchubbs	6523084bfc	commit	2026-05-03 11:48:11 -07:00
James Vecellio-Grant	94737c523d	Merge branch 'master' into tinygrad-sync-4/25	2026-04-28 11:34:56 -07:00
discountchubbs	46fd88376e	big or small brain?	2026-04-28 11:34:04 -07:00
discountchubbs	3ac95a7475	no HACKS	2026-04-27 20:27:54 -07:00
discountchubbs	4cc84c5680	no hacks	2026-04-27 20:20:50 -07:00
discountchubbs	0768b2408c	try this	2026-04-27 11:02:37 -07:00
discountchubbs	402f3c8966	legacy	2026-04-27 09:29:15 -07:00
discountchubbs	ae44e4d998	fetch	2026-04-27 07:59:20 -07:00
discountchubbs	ccf40652b6	correct tg	2026-04-26 20:01:21 -07:00
discountchubbs	271ed5e091	dm	2026-04-26 13:03:04 -07:00
discountchubbs	41dea5d48d	Revert "sync dmonitoring too" This reverts commit `dc11e5fd84`.	2026-04-26 13:02:31 -07:00
discountchubbs	dc11e5fd84	sync dmonitoring too	2026-04-26 12:53:45 -07:00
discountchubbs	ced4a664cc	use upstreams	2026-04-26 11:18:51 -07:00
discountchubbs	03db277c22	dev	2026-04-26 11:15:34 -07:00
discountchubbs	11ed3800bf	ugh	2026-04-26 11:10:17 -07:00
discountchubbs	92526b878c	json v17	2026-04-26 11:07:17 -07:00
discountchubbs	66ff8ae52c	shebang	2026-04-26 00:33:45 -07:00
discountchubbs	d85cb76304	lint	2026-04-26 00:32:31 -07:00
James Vecellio-Grant	b4c613680e	Update SConscript	2026-04-26 00:29:30 -07:00
discountchubbs	f7511491f7	sync new modeld changes	2026-04-26 00:27:47 -07:00
discountchubbs	88b30e199b	fix to new tg	2026-04-25 23:47:26 -07:00
discountchubbs	2898f394dd	bump	2026-04-25 23:38:31 -07:00
discountchubbs	554cf9ca4a	modeld: sync tinygrad	2026-04-25 23:28:02 -07:00