torquey

use upstreams
dev
2026-07-25 03:12:09 +08:00 · 2026-04-26 12:02:45 -07:00 · 2026-04-26 11:18:51 -07:00 · 2026-04-26 11:15:34 -07:00 · 2026-04-26 11:10:17 -07:00 · 2026-04-26 11:07:17 -07:00
16 changed files with 535 additions and 391 deletions
@@ -172,8 +172,8 @@ jobs:
            output_file="${{ env.MODELS_DIR }}/${base_name}_tinygrad.pkl"

            echo "Compiling: $onnx_file -> $output_file"
-            QCOM=1 python3 "${{ env.TINYGRAD_PATH }}/examples/openpilot/compile3.py" "$onnx_file" "$output_file"
-            DEV=QCOM FLOAT16=1 NOLOCALS=1 JIT_BATCH_SIZE=0 python3 "${{ env.MODELS_DIR }}/../get_model_metadata.py" "$onnx_file" || true
+            DEV=QCOM FLOAT16=1 NOLOCALS=1 JIT_BATCH_SIZE=0 OPENPILOT_HACKS=1 IMAGE=2 python3 "${{ env.TINYGRAD_PATH }}/examples/openpilot/compile3.py" "$onnx_file" "$output_file"
+            DEV=QCOM FLOAT16=1 NOLOCALS=1 JIT_BATCH_SIZE=0 OPENPILOT_HACKS=1 python3 "${{ env.MODELS_DIR }}/../get_model_metadata.py" "$onnx_file" || true
          done

      - name: Validate Model Outputs
@@ -22,15 +22,19 @@ from openpilot.sunnypilot.selfdrive.controls.lib.latcontrol_torque_ext import La
 # Additionally, there is friction in the steering wheel that needs
 # to be overcome to move it at all, this is compensated for too.

-KP = 0.8
-KI = 0.15
-
 INTERP_SPEEDS = [1, 1.5, 2.0, 3.0, 5, 7.5, 10, 15, 30]
-KP_INTERP = [250, 120, 65, 30, 11.5, 5.5, 3.5, 2.0, KP]
+KP_INTERP = [150.0, 100.0, 30.0, 6.0, 2.0, 1.5, 1.2, 0.8, 0.5]
+KI_INTERP = [0.06, 0.06, 0.06, 0.08, 0.08, 0.10, 0.12, 0.14, 0.16]
+JERK_INTERP = [0.25, 0.25, 0.25, 0.23, 0.20, 0.18, 0.16, 0.15, 0.15]
+MEAS_FILTER_TAU_INTERP = [0.07, 0.07, 0.07, 0.07, 0.07, 0.06, 0.06, 0.05, 0.01]
+INTEGRATOR_DECAY_INTERP = [0.990, 0.990, 0.990, 0.992, 0.993, 0.995, 0.996, 0.998, 0.999]
+INTEGRATOR_DECAY_FRAMES = 20
+STRAIGHT_DAMP_THRESHOLD_INTERP = [0.03, 0.03, 0.03, 0.03, 0.03, 0.05, 0.05, 0.2, 0.2]
+STRAIGHT_DAMP_MIN_INTERP = [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]
+STRAIGHT_DAMP_TAU = 0.5

 LP_FILTER_CUTOFF_HZ = 1.2
-JERK_LOOKAHEAD_SECONDS = 0.19
-JERK_GAIN = 0.3
+JERK_LOOKAHEAD_SECONDS = 0.30
 LAT_ACCEL_REQUEST_BUFFER_SECONDS = 1.0
 VERSION = 1

@@ -40,13 +44,16 @@ class LatControlTorque(LatControl):
    self.torque_params = CP.lateralTuning.torque.as_builder()
    self.torque_from_lateral_accel = CI.torque_from_lateral_accel()
    self.lateral_accel_from_torque = CI.lateral_accel_from_torque()
-    self.pid = PIDController([INTERP_SPEEDS, KP_INTERP], KI, rate=1/self.dt)
+    self.pid = PIDController([INTERP_SPEEDS, KP_INTERP], [INTERP_SPEEDS, KI_INTERP], rate=1/self.dt)
    self.update_limits()
    self.steering_angle_deadzone_deg = self.torque_params.steeringAngleDeadzoneDeg
    self.lat_accel_request_buffer_len = int(LAT_ACCEL_REQUEST_BUFFER_SECONDS / self.dt)
    self.lat_accel_request_buffer = deque([0.] * self.lat_accel_request_buffer_len , maxlen=self.lat_accel_request_buffer_len)
    self.lookahead_frames = int(JERK_LOOKAHEAD_SECONDS / self.dt)
    self.jerk_filter = FirstOrderFilter(0.0, 1 / (2 * np.pi * LP_FILTER_CUTOFF_HZ), self.dt)
+    self.measurement_filter = FirstOrderFilter(0.0, 0.04, self.dt)
+    self.integrator_decay_counter = 0
+    self.straight_damp_filter = FirstOrderFilter(1.0, STRAIGHT_DAMP_TAU, self.dt)

    self.extension = LatControlTorqueExt(self, CP, CP_SP, CI)

@@ -68,7 +75,9 @@ class LatControlTorque(LatControl):
    pid_log = log.ControlsState.LateralTorqueState.new_message()
    pid_log.version = VERSION
    measured_curvature = -VM.calc_curvature(math.radians(CS.steeringAngleDeg - params.angleOffsetDeg), CS.vEgo, params.roll)
-    measurement = measured_curvature * CS.vEgo ** 2
+    meas_tau = float(np.interp(CS.vEgo, INTERP_SPEEDS, MEAS_FILTER_TAU_INTERP))
+    self.measurement_filter.update_alpha(meas_tau)
+    measurement = self.measurement_filter.update(measured_curvature * CS.vEgo ** 2)
    future_desired_lateral_accel = desired_curvature * CS.vEgo ** 2
    self.lat_accel_request_buffer.append(future_desired_lateral_accel)

@@ -79,7 +88,12 @@ class LatControlTorque(LatControl):
    delay_frames = int(np.clip(lat_delay / self.dt + 1, 1, self.lat_accel_request_buffer_len))
    expected_lateral_accel = self.lat_accel_request_buffer[-delay_frames]
    setpoint = expected_lateral_accel
-    error = setpoint - measurement
+    raw_error = setpoint - measurement
+    damp_min = float(np.interp(CS.vEgo, INTERP_SPEEDS, STRAIGHT_DAMP_MIN_INTERP))
+    damp_threshold = float(np.interp(CS.vEgo, INTERP_SPEEDS, STRAIGHT_DAMP_THRESHOLD_INTERP))
+    damp_target = max(damp_min, min(1.0, abs(setpoint) / damp_threshold))
+    damp_factor = self.straight_damp_filter.update(damp_target)
+    error = raw_error * damp_factor

    lookahead_idx = int(np.clip(-delay_frames + self.lookahead_frames, -self.lat_accel_request_buffer_len+1, -2))
    raw_lateral_jerk = (self.lat_accel_request_buffer[lookahead_idx+1] - self.lat_accel_request_buffer[lookahead_idx-1]) / (2 * self.dt)
@@ -88,7 +102,8 @@ class LatControlTorque(LatControl):
    ff = gravity_adjusted_future_lateral_accel
    # latAccelOffset corrects roll compensation bias from device roll misalignment relative to car roll
    ff -= self.torque_params.latAccelOffset
-    ff += get_friction(error + JERK_GAIN * desired_lateral_jerk, lateral_accel_deadzone, FRICTION_THRESHOLD, self.torque_params)
+    jerk_gain = float(np.interp(CS.vEgo, INTERP_SPEEDS, JERK_INTERP))
+    ff += get_friction(error + jerk_gain * desired_lateral_jerk, lateral_accel_deadzone, FRICTION_THRESHOLD, self.torque_params)

    if not active:
      output_torque = 0.0
@@ -99,6 +114,15 @@ class LatControlTorque(LatControl):

      freeze_integrator = steer_limited_by_safety or CS.steeringPressed or CS.vEgo < 5
      output_lataccel = self.pid.update(pid_log.error, speed=CS.vEgo, feedforward=ff, freeze_integrator=freeze_integrator)
+
+      error_opposes_integrator = (self.pid.i > 0 and pid_log.error < 0) or (self.pid.i < 0 and pid_log.error > 0)
+      if error_opposes_integrator:
+        self.integrator_decay_counter = min(self.integrator_decay_counter + 1, INTEGRATOR_DECAY_FRAMES + 10)
+      else:
+        self.integrator_decay_counter = 0
+      if self.integrator_decay_counter >= INTEGRATOR_DECAY_FRAMES:
+        self.pid.i *= float(np.interp(CS.vEgo, INTERP_SPEEDS, INTEGRATOR_DECAY_INTERP))
+
      output_torque = self.torque_from_lateral_accel(output_lataccel, self.torque_params)

      # Lateral acceleration torque controller extension updates
@@ -32,12 +32,12 @@ COST_E_DIM = 5
 COST_DIM = COST_E_DIM + 1
 CONSTR_DIM = 4

-X_EGO_OBSTACLE_COST = 3.
+X_EGO_OBSTACLE_COST = 2.
 X_EGO_COST = 0.
 V_EGO_COST = 0.
 A_EGO_COST = 0.
 J_EGO_COST = 5.
-A_CHANGE_COST = 200.
+A_CHANGE_COST = 350.
 DANGER_ZONE_COST = 100.
 CRASH_DISTANCE = .25
 LEAD_DANGER_FACTOR = 0.75
@@ -53,7 +53,7 @@ T_IDXS_LST = [index_function(idx, max_val=MAX_T, max_idx=N) for idx in range(N+1
 T_IDXS = np.array(T_IDXS_LST)
 FCW_IDXS = T_IDXS < 5.0
 T_DIFFS = np.diff(T_IDXS, prepend=[0.])
-COMFORT_BRAKE = 2.5
+COMFORT_BRAKE = 2.
 STOP_DISTANCE = 6.0
 CRUISE_MIN_ACCEL = -1.2
 CRUISE_MAX_ACCEL = 1.6
@@ -1,10 +1,23 @@
 import glob
 import json
 import os
+from itertools import product
 from SCons.Script import Value
 from openpilot.common.file_chunker import chunk_file, get_chunk_paths
+from openpilot.common.transformations.camera import _ar_ox_fisheye, _os_fisheye
+from openpilot.common.transformations.model import MEDMODEL_INPUT_SIZE, DM_INPUT_SIZE
+from openpilot.selfdrive.modeld.constants import ModelConstants
+from openpilot.selfdrive.modeld.helpers import CompileConfig
 from tinygrad import Device

+CAMERA_CONFIGS = [
+  (_ar_ox_fisheye.width, _ar_ox_fisheye.height),  # tici: 1928x1208
+  (_os_fisheye.width, _os_fisheye.height),        # mici: 1344x760
+]
+MODELD_CONFIGS = [CompileConfig(cam_w, cam_h, prepare_only, 'driving_')
+                  for (cam_w, cam_h), prepare_only in product(CAMERA_CONFIGS, [True, False])]
+DM_WARP_CONFIGS = [CompileConfig(cam_w, cam_h, True, 'dm_') for cam_w, cam_h in CAMERA_CONFIGS]
+
 Import('env', 'arch')
 chunker_file = File("#common/file_chunker.py")
 lenv = env.Clone()
@@ -16,18 +29,17 @@ tinygrad_files = ["#"+x for x in glob.glob(env.Dir("#tinygrad_repo").relpath + "
 def estimate_pickle_max_size(onnx_size):
  return 1.2 * onnx_size  + 10 * 1024 * 1024  # 20% + 10MB is plenty

-# THREADS=0 is need to prevent bug: https://github.com/tinygrad/tinygrad/issues/14689
 # get fastest TG config
 available = set(Device.get_available_devices())
-# FIXME-SP: reset when we bump tg
-if False:  # 'CUDA' in available:
+if 'CUDA' in available:
  tg_backend = 'CUDA'
  tg_flags = f'DEV={tg_backend}'
 elif 'QCOM' in available:
  tg_backend = 'QCOM'
-  tg_flags = f'DEV={tg_backend} FLOAT16=1 NOLOCALS=1 JIT_BATCH_SIZE=0'
+  tg_flags = f'DEV={tg_backend} FLOAT16=1 NOLOCALS=1 JIT_BATCH_SIZE=0 OPENPILOT_HACKS=1'
 else:
-  tg_backend = 'CPU' if arch == 'Darwin' else 'CPU CPU_LLVM=1'  # FIXME-SP: reset when we bump tg
+  tg_backend = 'CPU' if arch == 'Darwin' else 'CPU:LLVM'
+  # THREADS=0 is need to prevent bug: https://github.com/tinygrad/tinygrad/issues/14689
  tg_flags = f'DEV={tg_backend} THREADS=0'

 def write_tg_compiled_flags(target, source, env):
@@ -54,14 +66,35 @@ for model_name in ['driving_vision', 'driving_policy', 'dmonitoring_model']:
 image_flag = {
     'larch64': 'IMAGE=2',
 }.get(arch, 'IMAGE=0')
-script_files = [File(Dir("#selfdrive/modeld").File("compile_warp.py").abspath)]
-compile_warp_cmd = f'{tg_flags} {mac_brew_string} python3 {Dir("#selfdrive/modeld").abspath}/compile_warp.py '
-from openpilot.common.transformations.camera import _ar_ox_fisheye, _os_fisheye
-warp_targets = []
-for cam in [_ar_ox_fisheye, _os_fisheye]:
-  w, h = cam.width, cam.height
-  warp_targets += [File(f"models/warp_{w}x{h}_tinygrad.pkl").abspath, File(f"models/dm_warp_{w}x{h}_tinygrad.pkl").abspath]
-lenv.Command(warp_targets, tinygrad_files + script_files + [compiled_flags_node], compile_warp_cmd)
+modeld_dir = Dir("#selfdrive/modeld").abspath
+compile_modeld_script = [File(f"{modeld_dir}/compile_modeld.py")]
+compile_dm_warp_script = [File(f"{modeld_dir}/compile_dm_warp.py")]
+driving_onnx_deps = [File(f"models/{m}.onnx").abspath for m in ['driving_vision', 'driving_policy']]
+driving_metadata_deps = [File(f"models/{m}_metadata.pkl").abspath for m in ['driving_vision', 'driving_policy']]
+
+model_w, model_h = MEDMODEL_INPUT_SIZE
+frame_skip = ModelConstants.MODEL_RUN_FREQ // ModelConstants.MODEL_CONTEXT_FREQ
+for cfg in MODELD_CONFIGS:
+  cmd = (f'{tg_flags} {mac_brew_string} {image_flag} python3 {modeld_dir}/compile_modeld.py '
+         f'--model-size {model_w}x{model_h} '
+         f'--nv12 {",".join(str(x) for x in cfg.nv12)} '
+         f'--vision-onnx {File("models/driving_vision.onnx").abspath} '
+         f'--policy-onnx {File("models/driving_policy.onnx").abspath} '
+         f'--output {cfg.pkl_path} --frame-skip {frame_skip}'
+         + (' --prepare-only' if cfg.prepare_only else ''))
+  node = lenv.Command(cfg.pkl_path, tinygrad_files + compile_modeld_script + driving_onnx_deps + driving_metadata_deps + [chunker_file, compiled_flags_node], cmd)
+  onnx_sizes_sum = sum(os.path.getsize(f) for f in driving_onnx_deps)
+  chunk_targets = get_chunk_paths(cfg.pkl_path, estimate_pickle_max_size(onnx_sizes_sum))
+  def do_chunk(target, source, env, pkl=cfg.pkl_path, chunks=chunk_targets):
+    chunk_file(pkl, chunks)
+  lenv.Command(chunk_targets, node, do_chunk)
+
+dm_w, dm_h = DM_INPUT_SIZE
+for cfg in DM_WARP_CONFIGS:
+  cmd = (f'{tg_flags} {mac_brew_string} {image_flag} python3 {modeld_dir}/compile_dm_warp.py '
+         f'--nv12 {",".join(str(x) for x in cfg.nv12)} --warp-to {dm_w}x{dm_h} '
+         f'--output {cfg.pkl_path}')
+  lenv.Command(cfg.pkl_path, tinygrad_files + compile_dm_warp_script + compile_modeld_script + [compiled_flags_node], cmd)

 def tg_compile(flags, model_name):
  pythonpath_string = 'PYTHONPATH="${PYTHONPATH}:' + env.Dir("#tinygrad_repo").abspath + '"'
@@ -82,7 +115,4 @@ def tg_compile(flags, model_name):
    do_chunk,
  )

-# Compile small models
-for model_name in ['driving_vision', 'driving_policy', 'dmonitoring_model']:
-  tg_compile(tg_flags, model_name)
-
+tg_compile(tg_flags, 'dmonitoring_model')
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+import argparse
+import pickle
+import time
+
+from tinygrad.tensor import Tensor
+from tinygrad.device import Device
+from tinygrad.engine.jit import TinyJit
+
+from openpilot.selfdrive.modeld.compile_modeld import NV12Frame, warp_perspective_tinygrad, _parse_size, _parse_nv12
+
+
+def make_warp_dm(nv12: NV12Frame, dm_w, dm_h):
+  cam_w, cam_h, stride, _, _, _ = nv12
+  stride_pad = stride - cam_w
+
+  def warp_dm(input_frame, M_inv):
+    M_inv = M_inv.to(Device.DEFAULT).realize()
+    return warp_perspective_tinygrad(input_frame[:cam_h*stride], M_inv,
+                                     (dm_w, dm_h), (cam_h, cam_w), stride_pad).reshape(-1, dm_h * dm_w)
+  return warp_dm
+
+
+def compile_dm_warp(nv12: NV12Frame, dm_w, dm_h, pkl_path):
+  print(f"Compiling DM warp for {nv12.width}x{nv12.height} -> {dm_w}x{dm_h}...")
+
+  warp_dm_jit = TinyJit(make_warp_dm(nv12, dm_w, dm_h), prune=True)
+
+  for i in range(10):
+    frame = Tensor.randint(nv12.size, low=0, high=256, dtype='uint8').realize()
+    M_inv = Tensor(Tensor.randn(3, 3).mul(8).realize().numpy(), device='NPY')
+    Device.default.synchronize()
+    st = time.perf_counter()
+    warp_dm_jit(frame, M_inv).realize()
+    mt = time.perf_counter()
+    Device.default.synchronize()
+    et = time.perf_counter()
+    print(f"  [{i+1}/10] enqueue {(mt-st)*1e3:6.2f} ms -- total {(et-st)*1e3:6.2f} ms")
+
+  with open(pkl_path, "wb") as f:
+    pickle.dump(warp_dm_jit, f)
+  print(f"  Saved to {pkl_path}")
+
+
+if __name__ == "__main__":
+  p = argparse.ArgumentParser()
+  p.add_argument('--nv12', type=_parse_nv12, required=True,
+                 help=f'NV12 frame layout: {",".join(NV12Frame._fields)}')
+  p.add_argument('--warp-to', type=_parse_size, required=True, help='DM input WxH')
+  p.add_argument('--output', required=True)
+  args = p.parse_args()
+
+  dm_w, dm_h = args.warp_to
+  compile_dm_warp(args.nv12, dm_w, dm_h, args.output)
@@ -0,0 +1,253 @@
+#!/usr/bin/env python3
+import argparse
+import pickle
+import time
+from functools import partial
+from collections import namedtuple
+
+import numpy as np
+from tinygrad.tensor import Tensor
+from tinygrad.helpers import Context
+from tinygrad.device import Device
+from tinygrad.engine.jit import TinyJit
+from tinygrad.nn.onnx import OnnxRunner
+
+# https://github.com/tinygrad/tinygrad/issues/15682
+from tinygrad.uop.ops import UOp, Ops
+_orig = UOp.__reduce__
+UOp.__reduce__ = lambda self: (UOp.unique, ()) if self.op is Ops.UNIQUE else _orig(self)
+
+
+NV12Frame = namedtuple("NV12Frame", ['width', 'height', 'stride', 'y_height', 'uv_height', 'size'])
+
+UV_SCALE_MATRIX = np.array([[0.5, 0, 0], [0, 0.5, 0], [0, 0, 1]], dtype=np.float32)
+UV_SCALE_MATRIX_INV = np.linalg.inv(UV_SCALE_MATRIX)
+
+
+def warp_perspective_tinygrad(src_flat, M_inv, dst_shape, src_shape, stride_pad):
+  w_dst, h_dst = dst_shape
+  h_src, w_src = src_shape
+
+  x = Tensor.arange(w_dst).reshape(1, w_dst).expand(h_dst, w_dst).reshape(-1)
+  y = Tensor.arange(h_dst).reshape(h_dst, 1).expand(h_dst, w_dst).reshape(-1)
+
+  # inline 3x3 matmul as elementwise to avoid reduce op (enables fusion with gather)
+  src_x = M_inv[0, 0] * x + M_inv[0, 1] * y + M_inv[0, 2]
+  src_y = M_inv[1, 0] * x + M_inv[1, 1] * y + M_inv[1, 2]
+  src_w = M_inv[2, 0] * x + M_inv[2, 1] * y + M_inv[2, 2]
+
+  src_x = src_x / src_w
+  src_y = src_y / src_w
+
+  x_nn_clipped = Tensor.round(src_x).clip(0, w_src - 1).cast('int')
+  y_nn_clipped = Tensor.round(src_y).clip(0, h_src - 1).cast('int')
+  idx = y_nn_clipped * (w_src + stride_pad) + x_nn_clipped
+
+  return src_flat[idx]
+
+
+def frames_to_tensor(frames):
+  H = (frames.shape[0] * 2) // 3
+  W = frames.shape[1]
+  in_img1 = Tensor.cat(frames[0:H:2, 0::2],
+                       frames[1:H:2, 0::2],
+                       frames[0:H:2, 1::2],
+                       frames[1:H:2, 1::2],
+                       frames[H:H+H//4].reshape((H//2, W//2)),
+                       frames[H+H//4:H+H//2].reshape((H//2, W//2)), dim=0).reshape((6, H//2, W//2))
+  return in_img1
+
+
+def make_frame_prepare(nv12: NV12Frame, model_w, model_h):
+  cam_w, cam_h, stride, y_height, uv_height, _ = nv12
+  uv_offset = stride * y_height
+  stride_pad = stride - cam_w
+
+  def frame_prepare_tinygrad(input_frame, M_inv):
+    # UV_SCALE @ M_inv @ UV_SCALE_INV simplifies to elementwise scaling
+    M_inv_uv = M_inv * Tensor([[1.0, 1.0, 0.5], [1.0, 1.0, 0.5], [2.0, 2.0, 1.0]])
+    # deinterleave NV12 UV plane (UVUV... -> separate U, V)
+    uv = input_frame[uv_offset:uv_offset + uv_height * stride].reshape(uv_height, stride)
+    with Context(SPLIT_REDUCEOP=0):
+      y = warp_perspective_tinygrad(input_frame[:cam_h*stride],
+                                    M_inv, (model_w, model_h),
+                                    (cam_h, cam_w), stride_pad).realize()
+      u = warp_perspective_tinygrad(uv[:cam_h//2, :cam_w:2].flatten(),
+                                    M_inv_uv, (model_w//2, model_h//2),
+                                    (cam_h//2, cam_w//2), 0).realize()
+      v = warp_perspective_tinygrad(uv[:cam_h//2, 1:cam_w:2].flatten(),
+                                    M_inv_uv, (model_w//2, model_h//2),
+                                    (cam_h//2, cam_w//2), 0).realize()
+    yuv = y.cat(u).cat(v).reshape((model_h * 3 // 2, model_w))
+    tensor = frames_to_tensor(yuv)
+    return tensor
+  return frame_prepare_tinygrad
+
+
+def make_input_queues(vision_input_shapes, policy_input_shapes, frame_skip):
+  img = vision_input_shapes['img']  # (1, 12, 128, 256)
+  n_frames = img[1] // 6
+  img_buf_shape = (frame_skip * (n_frames - 1) + 1, 6, img[2], img[3])
+
+  fb = policy_input_shapes['features_buffer']  # (1, 25, 512)
+  dp = policy_input_shapes['desire_pulse']  # (1, 25, 8)
+  tc = policy_input_shapes['traffic_convention']  # (1, 2)
+
+  npy = {
+    'desire': np.zeros(dp[2], dtype=np.float32),
+    'traffic_convention': np.zeros(tc, dtype=np.float32),
+    'tfm': np.zeros((3, 3), dtype=np.float32),
+    'big_tfm': np.zeros((3, 3), dtype=np.float32),
+  }
+  input_queues = {
+    'img_q': Tensor.zeros(img_buf_shape, dtype='uint8').contiguous().realize(),
+    'big_img_q': Tensor.zeros(img_buf_shape, dtype='uint8').contiguous().realize(),
+    'feat_q': Tensor.zeros(frame_skip * (fb[1] - 1) + 1, fb[0], fb[2]).contiguous().realize(),
+    'desire_q': Tensor.zeros(frame_skip * dp[1], dp[0], dp[2]).contiguous().realize(),
+    **{k: Tensor(v, device='NPY').realize() for k, v in npy.items()},
+  }
+  return input_queues, npy
+
+
+def shift_and_sample(buf, new_val, sample_fn):
+  buf.assign(buf[1:].cat(new_val, dim=0).contiguous())
+  return sample_fn(buf)
+
+
+def sample_skip(buf, frame_skip):
+  return buf[::frame_skip].contiguous().flatten(0, 1).unsqueeze(0)
+
+
+def sample_desire(buf, frame_skip):
+  return buf.reshape(-1, frame_skip, *buf.shape[1:]).max(1).flatten(0, 1).unsqueeze(0)
+
+
+def make_run_policy(vision_runner, policy_runner, nv12: NV12Frame, model_w, model_h,
+                    vision_features_slice, frame_skip, prepare_only=False):
+  frame_prepare = make_frame_prepare(nv12, model_w, model_h)
+  sample_skip_fn = partial(sample_skip, frame_skip=frame_skip)
+  sample_desire_fn = partial(sample_desire, frame_skip=frame_skip)
+
+  def run_policy(img_q, big_img_q, feat_q, desire_q, desire, traffic_convention, tfm, big_tfm, frame, big_frame):
+    tfm = tfm.to(Device.DEFAULT)
+    big_tfm = big_tfm.to(Device.DEFAULT)
+    desire = desire.to(Device.DEFAULT)
+    traffic_convention = traffic_convention.to(Device.DEFAULT)
+    Tensor.realize(tfm, big_tfm, desire, traffic_convention)
+
+    img = shift_and_sample(img_q, frame_prepare(frame, tfm).unsqueeze(0), sample_skip_fn)
+    big_img = shift_and_sample(big_img_q, frame_prepare(big_frame, big_tfm).unsqueeze(0), sample_skip_fn)
+
+    if prepare_only:
+      return img, big_img
+
+    vision_out = next(iter(vision_runner({'img': img, 'big_img': big_img}).values())).cast('float32')
+
+    new_feat = vision_out[:, vision_features_slice].reshape(1, -1).unsqueeze(0)
+    feat_buf = shift_and_sample(feat_q, new_feat, sample_skip_fn)
+    desire_buf = shift_and_sample(desire_q, desire.reshape(1, 1, -1), sample_desire_fn)
+
+    inputs = {'features_buffer': feat_buf, 'desire_pulse': desire_buf, 'traffic_convention': traffic_convention}
+    policy_out = next(iter(policy_runner(inputs).values())).cast('float32')
+
+    return vision_out, policy_out
+  return run_policy
+
+
+def compile_modeld(nv12: NV12Frame, model_w, model_h, prepare_only, frame_skip,
+                   vision_onnx, policy_onnx, pkl_path):
+  from get_model_metadata import metadata_path_for
+
+  print(f"Compiling combined policy JIT for {nv12.width}x{nv12.height} (prepare_only={prepare_only})...")
+
+  vision_runner = OnnxRunner(vision_onnx)
+  policy_runner = OnnxRunner(policy_onnx)
+
+  with open(metadata_path_for(vision_onnx), 'rb') as f:
+    vision_metadata = pickle.load(f)
+    vision_features_slice = vision_metadata['output_slices']['hidden_state']
+    vision_input_shapes = vision_metadata['input_shapes']
+  with open(metadata_path_for(policy_onnx), 'rb') as f:
+    policy_input_shapes = pickle.load(f)['input_shapes']
+
+  _run = make_run_policy(vision_runner, policy_runner, nv12, model_w, model_h,
+                         vision_features_slice, frame_skip, prepare_only)
+  run_policy_jit = TinyJit(_run, prune=True)
+
+  N_RUNS = 3
+  SEED = 42
+
+  def random_inputs_run_fn(fn, seed, test_val=None, test_buffers=None, expect_match=True):
+    input_queues, npy = make_input_queues(vision_input_shapes, policy_input_shapes, frame_skip)
+    np.random.seed(seed)
+    Tensor.manual_seed(seed)
+
+    for i in range(N_RUNS):
+      frame = Tensor.randint(nv12.size, low=0, high=256, dtype='uint8').realize()
+      big_frame = Tensor.randint(nv12.size, low=0, high=256, dtype='uint8').realize()
+      for v in npy.values():
+        v[:] = np.random.randn(*v.shape).astype(v.dtype)
+      Device.default.synchronize()
+      st = time.perf_counter()
+      outs = fn(**input_queues, frame=frame, big_frame=big_frame)
+      mt = time.perf_counter()
+      for o in outs:
+        # .realize() not needed once jitted, but needed for unjitted fn
+        o.realize()
+      Device.default.synchronize()
+      et = time.perf_counter()
+      print(f"  [{i+1}/{N_RUNS}] enqueue {(mt-st)*1e3:6.2f} ms -- total {(et-st)*1e3:6.2f} ms")
+
+    val = [np.copy(v.numpy()) for v in outs]
+    buffers = [np.copy(v.numpy().copy()) for v in input_queues.values()]
+
+    if test_val is not None:
+      match = all(np.array_equal(a, b) for a, b in zip(val, test_val, strict=True))
+      assert match == expect_match, f"outputs {'differ from' if expect_match else 'match'} baseline (seed={seed})"
+    if test_buffers is not None:
+      match = all(np.array_equal(a, b) for a, b in zip(buffers, test_buffers, strict=True))
+      assert match == expect_match, f"buffers {'differ from' if expect_match else 'match'} baseline (seed={seed})"
+    return fn, val, buffers
+
+  print('run unjitted')
+  _, test_val, test_buffers = random_inputs_run_fn(_run, seed=SEED)
+  print('capture + replay')
+  run_policy_jit, _, _ = random_inputs_run_fn(run_policy_jit, SEED, test_val, test_buffers)
+
+  print('pickle round trip')
+  with open(pkl_path, "wb") as f:
+    pickle.dump(run_policy_jit, f)
+    print(f"  Saved to {pkl_path}")
+  with open(pkl_path, "rb") as f:
+    run_policy_jit = pickle.load(f)
+  random_inputs_run_fn(run_policy_jit, SEED, test_val, test_buffers, expect_match=True)
+  random_inputs_run_fn(run_policy_jit, SEED+1, test_val, test_buffers, expect_match=False)
+
+
+def _parse_size(s):
+  w, h = s.lower().split('x')
+  return int(w), int(h)
+
+
+def _parse_nv12(s):
+  parts = s.split(',')
+  assert len(parts) == len(NV12Frame._fields), \
+    f"--nv12 expects {','.join(NV12Frame._fields)} (got {s!r})"
+  return NV12Frame(*(int(x) for x in parts))
+
+
+if __name__ == "__main__":
+  p = argparse.ArgumentParser()
+  p.add_argument('--model-size', type=_parse_size, required=True, help='model input WxH')
+  p.add_argument('--nv12', type=_parse_nv12, required=True,
+                 help=f'NV12 frame layout: {",".join(NV12Frame._fields)}')
+  p.add_argument('--vision-onnx', required=True)
+  p.add_argument('--policy-onnx', required=True)
+  p.add_argument('--output', required=True)
+  p.add_argument('--prepare-only', action='store_true')
+  p.add_argument('--frame-skip', type=int, required=True)
+  args = p.parse_args()
+
+  model_w, model_h = args.model_size
+  compile_modeld(args.nv12, model_w, model_h, args.prepare_only, args.frame_skip,
+                 args.vision_onnx, args.policy_onnx, args.output)
@@ -1,201 +0,0 @@
-#!/usr/bin/env python3
-import time
-import pickle
-import numpy as np
-from pathlib import Path
-from tinygrad.tensor import Tensor
-from tinygrad.helpers import Context
-from tinygrad.device import Device
-from tinygrad.engine.jit import TinyJit
-
-from openpilot.system.camerad.cameras.nv12_info import get_nv12_info
-from openpilot.common.transformations.model import MEDMODEL_INPUT_SIZE, DM_INPUT_SIZE
-from openpilot.common.transformations.camera import _ar_ox_fisheye, _os_fisheye
-
-MODELS_DIR = Path(__file__).parent / 'models'
-
-CAMERA_CONFIGS = [
-  (_ar_ox_fisheye.width, _ar_ox_fisheye.height),  # tici: 1928x1208
-  (_os_fisheye.width, _os_fisheye.height),        # mici: 1344x760
-]
-
-UV_SCALE_MATRIX = np.array([[0.5, 0, 0], [0, 0.5, 0], [0, 0, 1]], dtype=np.float32)
-UV_SCALE_MATRIX_INV = np.linalg.inv(UV_SCALE_MATRIX)
-
-IMG_BUFFER_SHAPE = (30, MEDMODEL_INPUT_SIZE[1] // 2, MEDMODEL_INPUT_SIZE[0] // 2)
-
-
-def warp_pkl_path(w, h):
-  return MODELS_DIR / f'warp_{w}x{h}_tinygrad.pkl'
-
-
-def dm_warp_pkl_path(w, h):
-  return MODELS_DIR / f'dm_warp_{w}x{h}_tinygrad.pkl'
-
-
-def warp_perspective_tinygrad(src_flat, M_inv, dst_shape, src_shape, stride_pad):
-  w_dst, h_dst = dst_shape
-  h_src, w_src = src_shape
-
-  x = Tensor.arange(w_dst).reshape(1, w_dst).expand(h_dst, w_dst).reshape(-1)
-  y = Tensor.arange(h_dst).reshape(h_dst, 1).expand(h_dst, w_dst).reshape(-1)
-
-  # inline 3x3 matmul as elementwise to avoid reduce op (enables fusion with gather)
-  src_x = M_inv[0, 0] * x + M_inv[0, 1] * y + M_inv[0, 2]
-  src_y = M_inv[1, 0] * x + M_inv[1, 1] * y + M_inv[1, 2]
-  src_w = M_inv[2, 0] * x + M_inv[2, 1] * y + M_inv[2, 2]
-
-  src_x = src_x / src_w
-  src_y = src_y / src_w
-
-  x_nn_clipped = Tensor.round(src_x).clip(0, w_src - 1).cast('int')
-  y_nn_clipped = Tensor.round(src_y).clip(0, h_src - 1).cast('int')
-  idx = y_nn_clipped * (w_src + stride_pad) + x_nn_clipped
-
-  return src_flat[idx]
-
-
-def frames_to_tensor(frames, model_w, model_h):
-  H = (frames.shape[0] * 2) // 3
-  W = frames.shape[1]
-  in_img1 = Tensor.cat(frames[0:H:2, 0::2],
-                       frames[1:H:2, 0::2],
-                       frames[0:H:2, 1::2],
-                       frames[1:H:2, 1::2],
-                       frames[H:H+H//4].reshape((H//2, W//2)),
-                       frames[H+H//4:H+H//2].reshape((H//2, W//2)), dim=0).reshape((6, H//2, W//2))
-  return in_img1
-
-
-def make_frame_prepare(cam_w, cam_h, model_w, model_h):
-  stride, y_height, uv_height, _ = get_nv12_info(cam_w, cam_h)
-  uv_offset = stride * y_height
-  stride_pad = stride - cam_w
-
-  def frame_prepare_tinygrad(input_frame, M_inv):
-    # UV_SCALE @ M_inv @ UV_SCALE_INV simplifies to elementwise scaling
-    M_inv_uv = M_inv * Tensor([[1.0, 1.0, 0.5], [1.0, 1.0, 0.5], [2.0, 2.0, 1.0]])
-    # deinterleave NV12 UV plane (UVUV... -> separate U, V)
-    uv = input_frame[uv_offset:uv_offset + uv_height * stride].reshape(uv_height, stride)
-    with Context(SPLIT_REDUCEOP=0):
-      y = warp_perspective_tinygrad(input_frame[:cam_h*stride],
-                                    M_inv, (model_w, model_h),
-                                    (cam_h, cam_w), stride_pad).realize()
-      u = warp_perspective_tinygrad(uv[:cam_h//2, :cam_w:2].flatten(),
-                                    M_inv_uv, (model_w//2, model_h//2),
-                                    (cam_h//2, cam_w//2), 0).realize()
-      v = warp_perspective_tinygrad(uv[:cam_h//2, 1:cam_w:2].flatten(),
-                                    M_inv_uv, (model_w//2, model_h//2),
-                                    (cam_h//2, cam_w//2), 0).realize()
-    yuv = y.cat(u).cat(v).reshape((model_h * 3 // 2, model_w))
-    tensor = frames_to_tensor(yuv, model_w, model_h)
-    return tensor
-  return frame_prepare_tinygrad
-
-
-def make_update_img_input(frame_prepare, model_w, model_h):
-  def update_img_input_tinygrad(tensor, frame, M_inv):
-    M_inv = M_inv.to(Device.DEFAULT)
-    new_img = frame_prepare(frame, M_inv)
-    tensor.assign(tensor[6:].cat(new_img, dim=0).contiguous())
-    return Tensor.cat(tensor[:6], tensor[-6:], dim=0).contiguous().reshape(1, 12, model_h//2, model_w//2)
-  return update_img_input_tinygrad
-
-
-def make_update_both_imgs(frame_prepare, model_w, model_h):
-  update_img = make_update_img_input(frame_prepare, model_w, model_h)
-
-  def update_both_imgs_tinygrad(calib_img_buffer, new_img, M_inv,
-                                calib_big_img_buffer, new_big_img, M_inv_big):
-    calib_img_pair = update_img(calib_img_buffer, new_img, M_inv)
-    calib_big_img_pair = update_img(calib_big_img_buffer, new_big_img, M_inv_big)
-    return calib_img_pair, calib_big_img_pair
-  return update_both_imgs_tinygrad
-
-
-def make_warp_dm(cam_w, cam_h, dm_w, dm_h):
-  stride, y_height, _, _ = get_nv12_info(cam_w, cam_h)
-  stride_pad = stride - cam_w
-
-  def warp_dm(input_frame, M_inv):
-    M_inv = M_inv.to(Device.DEFAULT)
-    result = warp_perspective_tinygrad(input_frame[:cam_h*stride], M_inv, (dm_w, dm_h), (cam_h, cam_w), stride_pad).reshape(-1, dm_h * dm_w)
-    return result
-  return warp_dm
-
-
-def compile_modeld_warp(cam_w, cam_h):
-  model_w, model_h = MEDMODEL_INPUT_SIZE
-  _, _, _, yuv_size = get_nv12_info(cam_w, cam_h)
-
-  print(f"Compiling modeld warp for {cam_w}x{cam_h}...")
-
-  frame_prepare = make_frame_prepare(cam_w, cam_h, model_w, model_h)
-  update_both_imgs = make_update_both_imgs(frame_prepare, model_w, model_h)
-  update_img_jit = TinyJit(update_both_imgs, prune=True)
-
-  full_buffer = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()
-  big_full_buffer = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()
-  new_frame_np = np.random.randint(0, 256, yuv_size, dtype=np.uint8)
-  new_big_frame_np = np.random.randint(0, 256, yuv_size, dtype=np.uint8)
-  for i in range(10):
-    img_inputs = [full_buffer,
-                  Tensor.from_blob(new_frame_np.ctypes.data, (yuv_size,), dtype='uint8').realize(),
-                  Tensor(Tensor.randn(3, 3).mul(8).realize().numpy(), device='NPY')]
-    big_img_inputs = [big_full_buffer,
-                      Tensor.from_blob(new_big_frame_np.ctypes.data, (yuv_size,), dtype='uint8').realize(),
-                      Tensor(Tensor.randn(3, 3).mul(8).realize().numpy(), device='NPY')]
-    inputs = img_inputs + big_img_inputs
-    Device.default.synchronize()
-
-    st = time.perf_counter()
-    _ = update_img_jit(*inputs)
-    mt = time.perf_counter()
-    Device.default.synchronize()
-    et = time.perf_counter()
-    print(f"  [{i+1}/10] enqueue {(mt-st)*1e3:6.2f} ms -- total {(et-st)*1e3:6.2f} ms")
-
-  pkl_path = warp_pkl_path(cam_w, cam_h)
-  with open(pkl_path, "wb") as f:
-    pickle.dump(update_img_jit, f)
-  print(f"  Saved to {pkl_path}")
-
-  jit = pickle.load(open(pkl_path, "rb"))
-  jit(*inputs)
-
-
-def compile_dm_warp(cam_w, cam_h):
-  dm_w, dm_h = DM_INPUT_SIZE
-  _, _, _, yuv_size = get_nv12_info(cam_w, cam_h)
-
-  print(f"Compiling DM warp for {cam_w}x{cam_h}...")
-
-  warp_dm = make_warp_dm(cam_w, cam_h, dm_w, dm_h)
-  warp_dm_jit = TinyJit(warp_dm, prune=True)
-
-  new_frame_np = np.random.randint(0, 256, yuv_size, dtype=np.uint8)
-  for i in range(10):
-    inputs = [Tensor.from_blob(new_frame_np.ctypes.data, (yuv_size,), dtype='uint8').realize(),
-              Tensor(Tensor.randn(3, 3).mul(8).realize().numpy(), device='NPY')]
-    Device.default.synchronize()
-    st = time.perf_counter()
-    warp_dm_jit(*inputs)
-    mt = time.perf_counter()
-    Device.default.synchronize()
-    et = time.perf_counter()
-    print(f"  [{i+1}/10] enqueue {(mt-st)*1e3:6.2f} ms -- total {(et-st)*1e3:6.2f} ms")
-
-  pkl_path = dm_warp_pkl_path(cam_w, cam_h)
-  with open(pkl_path, "wb") as f:
-    pickle.dump(warp_dm_jit, f)
-  print(f"  Saved to {pkl_path}")
-
-
-def run_and_save_pickle():
-  for cam_w, cam_h in CAMERA_CONFIGS:
-    compile_modeld_warp(cam_w, cam_h)
-    compile_dm_warp(cam_w, cam_h)
-
-
-if __name__ == "__main__":
-  run_and_save_pickle()
@@ -7,6 +7,10 @@ from typing import Any

 from tinygrad.nn.onnx import OnnxPBParser

+def metadata_path_for(onnx_path) -> pathlib.Path:
+  p = pathlib.Path(onnx_path)
+  return p.parent / (p.stem + '_metadata.pkl')
+

 class MetadataOnnxPBParser(OnnxPBParser):
  def _parse_ModelProto(self) -> dict:
@@ -48,7 +52,7 @@ if __name__ == "__main__":
    'output_shapes': dict(get_name_and_shape(x) for x in model["graph"]["output"]),
  }

-  metadata_path = model_path.parent / (model_path.stem + '_metadata.pkl')
+  metadata_path = metadata_path_for(model_path)
  with open(metadata_path, 'wb') as f:
    pickle.dump(metadata, f)

@@ -0,0 +1,31 @@
+import json
+import os
+from dataclasses import dataclass
+from pathlib import Path
+
+from openpilot.system.camerad.cameras.nv12_info import get_nv12_info
+
+MODELS_DIR = Path(__file__).resolve().parent / 'models'
+COMPILED_FLAGS_PATH = MODELS_DIR / 'tg_compiled_flags.json'
+
+
+def set_tinygrad_backend_from_compiled_flags() -> None:
+  if os.path.isfile(COMPILED_FLAGS_PATH):
+    with open(COMPILED_FLAGS_PATH) as f:
+      os.environ['DEV'] = str(json.load(f)['DEV'])
+
+
+@dataclass
+class CompileConfig:
+  cam_w: int
+  cam_h: int
+  prepare_only: bool
+  prefix: str
+
+  @property
+  def pkl_path(self):
+    return str(MODELS_DIR / f'{self.prefix}{"warp_" if self.prepare_only else ""}{self.cam_w}x{self.cam_h}_tinygrad.pkl')
+
+  @property
+  def nv12(self):
+    return (self.cam_w, self.cam_h, *get_nv12_info(self.cam_w, self.cam_h))
@@ -1,12 +1,8 @@
 #!/usr/bin/env python3
 import os
-from openpilot.selfdrive.modeld.tinygrad_helpers import MODELS_DIR, set_tinygrad_backend_from_compiled_flags
+from openpilot.selfdrive.modeld.helpers import MODELS_DIR, CompileConfig, set_tinygrad_backend_from_compiled_flags
 set_tinygrad_backend_from_compiled_flags()

-# FIXME-SP: remove once we bump tg
-from openpilot.system.hardware import TICI
-os.environ['DEV'] = 'QCOM' if TICI else 'CPU'
-
 USBGPU = "USBGPU" in os.environ
 if USBGPU:
  os.environ['DEV'] = 'AMD'
@@ -30,6 +26,7 @@ from openpilot.common.transformations.model import get_warp_matrix
 from openpilot.selfdrive.controls.lib.desire_helper import DesireHelper
 from openpilot.selfdrive.controls.lib.drive_helpers import get_accel_from_plan, smooth_value, get_curvature_from_plan
 from openpilot.selfdrive.modeld.parse_model_outputs import Parser
+from openpilot.selfdrive.modeld.compile_modeld import make_input_queues
 from openpilot.selfdrive.modeld.fill_model_msg import fill_model_msg, fill_pose_msg, PublishState
 from openpilot.common.file_chunker import read_file_chunked
 from openpilot.selfdrive.modeld.constants import ModelConstants, Plan
@@ -41,17 +38,13 @@ from openpilot.sunnypilot.modeld_v2.modeld_base import ModelStateBase
 PROCESS_NAME = "selfdrive.modeld.modeld"
 SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')

-VISION_PKL_PATH = MODELS_DIR / 'driving_vision_tinygrad.pkl'
 VISION_METADATA_PATH = MODELS_DIR / 'driving_vision_metadata.pkl'
-POLICY_PKL_PATH = MODELS_DIR / 'driving_policy_tinygrad.pkl'
 POLICY_METADATA_PATH = MODELS_DIR / 'driving_policy_metadata.pkl'

 LAT_SMOOTH_SECONDS = 0.0
 LONG_SMOOTH_SECONDS = 0.3
 MIN_LAT_CONTROL_SPEED = 0.3

-IMG_QUEUE_SHAPE = (6*(ModelConstants.MODEL_RUN_FREQ//ModelConstants.MODEL_CONTEXT_FREQ + 1), 128, 256)
-assert IMG_QUEUE_SHAPE[0] == 30


 def get_action_from_model(model_output: dict[str, np.ndarray], prev_action: log.ModelDataV2.Action,
@@ -86,108 +79,39 @@ class FrameMeta:
    if vipc is not None:
      self.frame_id, self.timestamp_sof, self.timestamp_eof = vipc.frame_id, vipc.timestamp_sof, vipc.timestamp_eof

-class InputQueues:
-  def __init__ (self, model_fps, env_fps, n_frames_input):
-    assert env_fps % model_fps == 0
-    assert env_fps >= model_fps
-    self.model_fps = model_fps
-    self.env_fps = env_fps
-    self.n_frames_input = n_frames_input
-
-    self.dtypes = {}
-    self.shapes = {}
-    self.q = {}
-
-  def update_dtypes_and_shapes(self, input_dtypes, input_shapes) -> None:
-    self.dtypes.update(input_dtypes)
-    if self.env_fps == self.model_fps:
-      self.shapes.update(input_shapes)
-    else:
-      for k in input_shapes:
-        shape = list(input_shapes[k])
-        if 'img' in k:
-          n_channels = shape[1] // self.n_frames_input
-          shape[1] = (self.env_fps // self.model_fps + (self.n_frames_input - 1)) * n_channels
-        else:
-          shape[1] = (self.env_fps // self.model_fps) * shape[1]
-        self.shapes[k] = tuple(shape)
-
-  def reset(self) -> None:
-    self.q = {k: np.zeros(self.shapes[k], dtype=self.dtypes[k]) for k in self.dtypes.keys()}
-
-  def enqueue(self, inputs:dict[str, np.ndarray]) -> None:
-    for k in inputs.keys():
-      if inputs[k].dtype != self.dtypes[k]:
-        raise ValueError(f'supplied input <{k}({inputs[k].dtype})> has wrong dtype, expected {self.dtypes[k]}')
-      input_shape = list(self.shapes[k])
-      input_shape[1] = -1
-      single_input = inputs[k].reshape(tuple(input_shape))
-      sz = single_input.shape[1]
-      self.q[k][:,:-sz] = self.q[k][:,sz:]
-      self.q[k][:,-sz:] = single_input
-
-  def get(self, *names) -> dict[str, np.ndarray]:
-    if self.env_fps == self.model_fps:
-      return {k: self.q[k] for k in names}
-    else:
-      out = {}
-      for k in names:
-        shape = self.shapes[k]
-        if 'img' in k:
-          n_channels = shape[1] // (self.env_fps // self.model_fps + (self.n_frames_input - 1))
-          out[k] = np.concatenate([self.q[k][:, s:s+n_channels] for s in np.linspace(0, shape[1] - n_channels, self.n_frames_input, dtype=int)], axis=1)
-        elif 'pulse' in k:
-          # any pulse within interval counts
-          out[k] = self.q[k].reshape((shape[0], shape[1] * self.model_fps // self.env_fps, self.env_fps // self.model_fps, -1)).max(axis=2)
-        else:
-          idxs = np.arange(-1, -shape[1], -self.env_fps // self.model_fps)[::-1]
-          out[k] = self.q[k][:, idxs]
-      return out

 class ModelState(ModelStateBase):
-  inputs: dict[str, np.ndarray]
-  output: np.ndarray
  prev_desire: np.ndarray  # for tracking the rising edge of the pulse

-  def __init__(self):
+  def __init__(self, cam_w: int, cam_h: int):
    ModelStateBase.__init__(self)
    self.LAT_SMOOTH_SECONDS = LAT_SMOOTH_SECONDS
+
    with open(VISION_METADATA_PATH, 'rb') as f:
      vision_metadata = pickle.load(f)
      self.vision_input_shapes =  vision_metadata['input_shapes']
      self.vision_input_names = list(self.vision_input_shapes.keys())
      self.vision_output_slices = vision_metadata['output_slices']
-      vision_output_size = vision_metadata['output_shapes']['outputs'][1]

    with open(POLICY_METADATA_PATH, 'rb') as f:
      policy_metadata = pickle.load(f)
      self.policy_input_shapes =  policy_metadata['input_shapes']
      self.policy_output_slices = policy_metadata['output_slices']
-      policy_output_size = policy_metadata['output_shapes']['outputs'][1]

    self.prev_desire = np.zeros(ModelConstants.DESIRE_LEN, dtype=np.float32)

-    # policy inputs
-    self.numpy_inputs = {k: np.zeros(self.policy_input_shapes[k], dtype=np.float32) for k in self.policy_input_shapes}
-    self.full_input_queues = InputQueues(ModelConstants.MODEL_CONTEXT_FREQ, ModelConstants.MODEL_RUN_FREQ, ModelConstants.N_FRAMES)
-    for k in ['desire_pulse', 'features_buffer']:
-      self.full_input_queues.update_dtypes_and_shapes({k: self.numpy_inputs[k].dtype}, {k: self.numpy_inputs[k].shape})
-    self.full_input_queues.reset()
-
-    self.img_queues = {'img': Tensor.zeros(IMG_QUEUE_SHAPE, dtype='uint8').contiguous().realize(),
-                       'big_img': Tensor.zeros(IMG_QUEUE_SHAPE, dtype='uint8').contiguous().realize()}
+    self.frame_skip = ModelConstants.MODEL_RUN_FREQ // ModelConstants.MODEL_CONTEXT_FREQ
+    self.input_queues, self.npy = make_input_queues(self.vision_input_shapes, self.policy_input_shapes, self.frame_skip)
    self.full_frames : dict[str, Tensor] = {}
    self._blob_cache : dict[int, Tensor] = {}
-    self.transforms_np = {k: np.zeros((3,3), dtype=np.float32) for k in self.img_queues}
-    self.transforms = {k: Tensor(v, device='NPY').realize() for k, v in self.transforms_np.items()}
-    self.vision_output = np.zeros(vision_output_size, dtype=np.float32)
-    self.policy_inputs = {k: Tensor(v, device='NPY').realize() for k,v in self.numpy_inputs.items()}
-    self.policy_output = np.zeros(policy_output_size, dtype=np.float32)
    self.parser = Parser()
-    self.frame_buf_params : dict[str, tuple[int, int, int, int]] = {}
-    self.update_imgs = None
-    self.vision_run = pickle.loads(read_file_chunked(str(VISION_PKL_PATH)))
-    self.policy_run = pickle.loads(read_file_chunked(str(POLICY_PKL_PATH)))
+    self.frame_buf_params = {k: get_nv12_info(cam_w, cam_h) for k in ('img', 'big_img')}
+    self.run_policy = pickle.loads(read_file_chunked(CompileConfig(cam_w, cam_h, prefix='driving_', prepare_only=False).pkl_path))
+    self.warp_enqueue = pickle.loads(read_file_chunked(CompileConfig(cam_w, cam_h, prefix='driving_', prepare_only=True).pkl_path))
+    self.warp_enqueue(
+      **self.input_queues,
+      frame=Tensor.zeros(self.frame_buf_params['img'][3], dtype='uint8').contiguous().realize(),
+      big_frame=Tensor.zeros(self.frame_buf_params['big_img'][3], dtype='uint8').contiguous().realize())

  def slice_outputs(self, model_outputs: np.ndarray, output_slices: dict[str, slice]) -> dict[str, np.ndarray]:
    parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in output_slices.items()}
@@ -195,18 +119,6 @@ class ModelState(ModelStateBase):

  def run(self, bufs: dict[str, VisionBuf], transforms: dict[str, np.ndarray],
                inputs: dict[str, np.ndarray], prepare_only: bool) -> dict[str, np.ndarray] | None:
-    # Model decides when action is completed, so desire input is just a pulse triggered on rising edge
-    inputs['desire_pulse'][0] = 0
-    new_desire = np.where(inputs['desire_pulse'] - self.prev_desire > .99, inputs['desire_pulse'], 0)
-    self.prev_desire[:] = inputs['desire_pulse']
-    if self.update_imgs is None:
-      for key in bufs.keys():
-        w, h = bufs[key].width, bufs[key].height
-        self.frame_buf_params[key] = get_nv12_info(w, h)
-      warp_path = MODELS_DIR / f'warp_{w}x{h}_tinygrad.pkl'
-      with open(warp_path, "rb") as f:
-        self.update_imgs = pickle.load(f)
-
    for key in bufs.keys():
      ptr = bufs[key].data.ctypes.data
      yuv_size = self.frame_buf_params[key][3]
@@ -215,30 +127,31 @@ class ModelState(ModelStateBase):
      if cache_key not in self._blob_cache:
        self._blob_cache[cache_key] = Tensor.from_blob(ptr, (yuv_size,), dtype='uint8')
      self.full_frames[key] = self._blob_cache[cache_key]
-    for key in bufs.keys():
-      self.transforms_np[key][:,:] = transforms[key][:,:]

-    out = self.update_imgs(self.img_queues['img'], self.full_frames['img'], self.transforms['img'],
-                           self.img_queues['big_img'], self.full_frames['big_img'], self.transforms['big_img'])
-    vision_inputs = {'img': out[0], 'big_img': out[1]}
+    # Model decides when action is completed, so desire input is just a pulse triggered on rising edge
+    inputs['desire_pulse'][0] = 0
+    self.npy['desire'][:] = np.where(inputs['desire_pulse'] - self.prev_desire > .99, inputs['desire_pulse'], 0)
+    self.prev_desire[:] = inputs['desire_pulse']
+    self.npy['traffic_convention'][:] = inputs['traffic_convention']
+    self.npy['tfm'][:,:] = transforms['img'][:,:]
+    self.npy['big_tfm'][:,:] = transforms['big_img'][:,:]

    if prepare_only:
+      self.warp_enqueue(**self.input_queues, frame=self.full_frames['img'], big_frame=self.full_frames['big_img'])
      return None

-    self.vision_output = self.vision_run(**vision_inputs).contiguous().realize().uop.base.buffer.numpy().flatten()
-    vision_outputs_dict = self.parser.parse_vision_outputs(self.slice_outputs(self.vision_output, self.vision_output_slices))
+    vision_output, policy_output = self.run_policy(
+      **self.input_queues, frame=self.full_frames['img'], big_frame=self.full_frames['big_img']
+    )

-    self.full_input_queues.enqueue({'features_buffer': vision_outputs_dict['hidden_state'], 'desire_pulse': new_desire})
-    for k in ['desire_pulse', 'features_buffer']:
-      self.numpy_inputs[k][:] = self.full_input_queues.get(k)[k]
-    self.numpy_inputs['traffic_convention'][:] = inputs['traffic_convention']
-
-    self.policy_output = self.policy_run(**self.policy_inputs).contiguous().realize().uop.base.buffer.numpy().flatten()
-    policy_outputs_dict = self.parser.parse_policy_outputs(self.slice_outputs(self.policy_output, self.policy_output_slices))
+    vision_output = vision_output.numpy().flatten()
+    policy_output = policy_output.numpy().flatten()
+    vision_outputs_dict = self.parser.parse_vision_outputs(self.slice_outputs(vision_output, self.vision_output_slices))
+    policy_outputs_dict = self.parser.parse_policy_outputs(self.slice_outputs(policy_output, self.policy_output_slices))
    combined_outputs_dict = {**vision_outputs_dict, **policy_outputs_dict}
-    if SEND_RAW_PRED:
-      combined_outputs_dict['raw_pred'] = np.concatenate([self.vision_output.copy(), self.policy_output.copy()])

+    if SEND_RAW_PRED:
+      combined_outputs_dict['raw_pred'] = np.concatenate([vision_output.copy(), policy_output.copy()])
    return combined_outputs_dict


@@ -250,11 +163,6 @@ def main(demo=False):
    # also need to move the aux USB interrupts for good timings
    config_realtime_process(7, 54)

-  st = time.monotonic()
-  cloudlog.warning("loading model")
-  model = ModelState()
-  cloudlog.warning(f"models loaded in {time.monotonic() - st:.1f}s, modeld starting")
-
  # visionipc clients
  while True:
    available_streams = VisionIpcClient.available_streams("camerad", block=False)
@@ -278,6 +186,11 @@ def main(demo=False):
  if use_extra_client:
    cloudlog.warning(f"connected extra cam with buffer size: {vipc_client_extra.buffer_len} ({vipc_client_extra.width} x {vipc_client_extra.height})")

+  st = time.monotonic()
+  cloudlog.warning("loading model")
+  model = ModelState(vipc_client_main.width, vipc_client_main.height)
+  cloudlog.warning(f"models loaded in {time.monotonic() - st:.1f}s, modeld starting")
+
  # messaging
  pm = PubMaster(["modelV2", "drivingModelData", "cameraOdometry", "modelDataV2SP"])
  sm = SubMaster(["deviceState", "carState", "roadCameraState", "liveCalibration", "driverMonitoringState", "carControl", "liveDelay"])
@@ -1,12 +0,0 @@
-import json
-import os
-from pathlib import Path
-
-MODELS_DIR = Path(__file__).parent / 'models'
-COMPILED_FLAGS_PATH = MODELS_DIR / 'tg_compiled_flags.json'
-
-
-def set_tinygrad_backend_from_compiled_flags() -> None:
-  if os.path.isfile(COMPILED_FLAGS_PATH):
-    with open(COMPILED_FLAGS_PATH) as f:
-      os.environ['DEV'] = str(json.load(f)['DEV'])
@@ -1,5 +1,6 @@
 import os
 import glob
+from tinygrad import Device

 Import('env', 'arch')
 lenv = env.Clone()
@@ -21,10 +22,19 @@ if PC:
  if outputs:
    lenv.Command(outputs, inputs, cmd)

-tg_flags = {
-  'larch64': 'DEV=QCOM FLOAT16=1 NOLOCALS=1 JIT_BATCH_SIZE=0',
-  'Darwin': f'DEV=CPU THREADS=0 HOME={os.path.expanduser("~")}',
-}.get(arch, 'DEV=CPU CPU_LLVM=1 THREADS=0')
+available = set(Device.get_available_devices())
+if 'CUDA' in available:
+  tg_backend = 'CUDA'
+  tg_flags = f'DEV={tg_backend}'
+elif 'QCOM' in available:
+  tg_backend = 'QCOM'
+  tg_flags = f'DEV={tg_backend} FLOAT16=1 NOLOCALS=1 JIT_BATCH_SIZE=0 OPENPILOT_HACKS=1'
+else:
+  tg_backend = 'CPU' if arch == 'Darwin' else 'CPU:LLVM'
+  # THREADS=0 is need to prevent bug: https://github.com/tinygrad/tinygrad/issues/14689
+  tg_flags = f'DEV={tg_backend} THREADS=0'
+
+mac_brew_string = f'HOME={os.path.expanduser("~")}' if arch == 'Darwin' else ''

 image_flag = {
  'larch64': 'IMAGE=2',
@@ -38,7 +48,7 @@ def tg_compile(flags, model_name):
  return lenv.Command(
    out,
    [fn + ".onnx"] + tinygrad_files,
-    f'{pythonpath_string} {flags} {image_flag} python3 {Dir("#tinygrad_repo").abspath}/examples/openpilot/compile3.py {fn}.onnx {out}'
+    f'{pythonpath_string} {tg_flags} {mac_brew_string} {image_flag} python3 {Dir("#tinygrad_repo").abspath}/examples/openpilot/compile3.py {fn}.onnx {out}'
  )

 # Compile models
@@ -46,9 +56,9 @@ for model_name in ['supercombo', 'driving_vision', 'driving_off_policy', 'drivin
  if File(f"models/{model_name}.onnx").exists():
    tg_compile(tg_flags, model_name)

-script_files = [File("warp.py"), File(Dir("#selfdrive/modeld").File("compile_warp.py").abspath)]
+script_files = [File("warp.py"), File(Dir("#selfdrive/modeld").File("compile_modeld.py").abspath)]
 pythonpath_string = 'PYTHONPATH="${PYTHONPATH}:' + env.Dir("#tinygrad_repo").abspath + ':' + env.Dir("#").abspath + '"'
-compile_warp_cmd = f'{pythonpath_string} {tg_flags} python3 -m sunnypilot.modeld_v2.warp'
+compile_warp_cmd = f'{pythonpath_string} {tg_flags} {mac_brew_string} {image_flag} python3 -m sunnypilot.modeld_v2.warp'

 from openpilot.common.transformations.camera import _ar_ox_fisheye, _os_fisheye
 warp_targets = []
@@ -2,8 +2,11 @@ import os
 os.environ['DEV'] = 'CPU'
 import pytest
 import numpy as np
-from openpilot.selfdrive.modeld.compile_warp import get_nv12_info, CAMERA_CONFIGS
-from openpilot.sunnypilot.modeld_v2.warp import Warp, MODEL_W, MODEL_H
+from openpilot.sunnypilot.modeld_v2.warp import CAMERA_CONFIGS
+from openpilot.system.camerad.cameras.nv12_info import get_nv12_info
+from openpilot.sunnypilot.modeld_v2.warp import Warp
+from openpilot.common.transformations.model import MEDMODEL_INPUT_SIZE
+MODEL_W, MODEL_H = MEDMODEL_INPUT_SIZE

 VISION_NAME_PAIRS = [  # needed to account for supercombos input_imgs
  ('img', 'big_img'),
@@ -6,29 +6,61 @@ from tinygrad.tensor import Tensor
 from tinygrad.engine.jit import TinyJit
 from tinygrad.device import Device

+# https://github.com/tinygrad/tinygrad/issues/15682
+from tinygrad.uop.ops import UOp, Ops
+_orig = UOp.__reduce__
+UOp.__reduce__ = lambda self: (UOp.unique, ()) if self.op is Ops.UNIQUE else _orig(self)
 from openpilot.system.camerad.cameras.nv12_info import get_nv12_info
-from openpilot.selfdrive.modeld.compile_warp import (
-  CAMERA_CONFIGS, MEDMODEL_INPUT_SIZE, make_frame_prepare, make_update_both_imgs,
-  warp_pkl_path,
+from openpilot.selfdrive.modeld.compile_modeld import (
+  NV12Frame, make_frame_prepare,
 )
+from openpilot.common.transformations.camera import _ar_ox_fisheye, _os_fisheye
+
+CAMERA_CONFIGS = [
+  (_ar_ox_fisheye.width, _ar_ox_fisheye.height),  # tici: 1928x1208
+  (_os_fisheye.width, _os_fisheye.height),        # mici: 1344x760
+]
+from openpilot.common.transformations.model import MEDMODEL_INPUT_SIZE

 MODELS_DIR = Path(__file__).parent / 'models'
-MODEL_W, MODEL_H = MEDMODEL_INPUT_SIZE
+
 UPSTREAM_BUFFER_LENGTH = 5

+def warp_pkl_path(cam_w, cam_h):
+  return MODELS_DIR / f'warp_{cam_w}x{cam_h}_tinygrad.pkl'
+
+def make_update_img_input(frame_prepare, model_w, model_h):
+  def update_img_input_tinygrad(tensor, frame, M_inv):
+    M_inv = M_inv.to(Device.DEFAULT)
+    new_img = frame_prepare(frame, M_inv)
+    tensor.assign(tensor[6:].cat(new_img, dim=0).contiguous())
+    return Tensor.cat(tensor[:6], tensor[-6:], dim=0).contiguous().reshape(1, 12, model_h//2, model_w//2)
+  return update_img_input_tinygrad
+
+def make_update_both_imgs(frame_prepare, model_w, model_h):
+  update_img = make_update_img_input(frame_prepare, model_w, model_h)
+
+  def update_both_imgs_tinygrad(calib_img_buffer, new_img, M_inv,
+                                calib_big_img_buffer, new_big_img, M_inv_big):
+    calib_img_pair = update_img(calib_img_buffer, new_img, M_inv)
+    calib_big_img_pair = update_img(calib_big_img_buffer, new_big_img, M_inv_big)
+    return calib_img_pair, calib_big_img_pair
+  return update_both_imgs_tinygrad
+

 def v2_warp_pkl_path(cam_w, cam_h, buffer_length):
  return MODELS_DIR / f'warp_{cam_w}x{cam_h}_b{buffer_length}_tinygrad.pkl'


-def compile_v2_warp(cam_w, cam_h, buffer_length):
+def compile_v2_warp(cam_w, cam_h, buffer_length, model_w=MEDMODEL_INPUT_SIZE[0], model_h=MEDMODEL_INPUT_SIZE[1]):
  _, _, _, yuv_size = get_nv12_info(cam_w, cam_h)
-  img_buffer_shape = (buffer_length * 6, MODEL_H // 2, MODEL_W // 2)
+  img_buffer_shape = (buffer_length * 6, model_h // 2, model_w // 2)

  print(f"Compiling v2 warp for {cam_w}x{cam_h} buffer_length={buffer_length}...")

-  frame_prepare = make_frame_prepare(cam_w, cam_h, MODEL_W, MODEL_H)
-  update_both_imgs = make_update_both_imgs(frame_prepare, MODEL_W, MODEL_H)
+  nv12 = NV12Frame(cam_w, cam_h, *get_nv12_info(cam_w, cam_h))
+  frame_prepare = make_frame_prepare(nv12, model_w, model_h)
+  update_both_imgs = make_update_both_imgs(frame_prepare, model_w, model_h)
  update_img_jit = TinyJit(update_both_imgs, prune=True)

  full_buffer = Tensor.zeros(img_buffer_shape, dtype='uint8').contiguous().realize()
@@ -62,9 +94,11 @@ def compile_v2_warp(cam_w, cam_h, buffer_length):


 class Warp:
-  def __init__(self, buffer_length=2):
+  def __init__(self, buffer_length=2, model_w=MEDMODEL_INPUT_SIZE[0], model_h=MEDMODEL_INPUT_SIZE[1]):
    self.buffer_length = buffer_length
-    self.img_buffer_shape = (buffer_length * 6, MODEL_H // 2, MODEL_W // 2)
+    self.model_w = model_w
+    self.model_h = model_h
+    self.img_buffer_shape = (buffer_length * 6, model_h // 2, model_w // 2)

    self.jit_cache = {}
    self.full_buffers = {k: Tensor.zeros(self.img_buffer_shape, dtype='uint8').contiguous().realize() for k in ['img', 'big_img']}
@@ -92,8 +126,9 @@ class Warp:
          with open(upstream_pkl, 'rb') as f:
            self.jit_cache[key] = pickle.load(f)
      if key not in self.jit_cache:
-        frame_prepare = make_frame_prepare(cam_w, cam_h, MODEL_W, MODEL_H)
-        update_both_imgs = make_update_both_imgs(frame_prepare, MODEL_W, MODEL_H)
+        nv12 = NV12Frame(cam_w, cam_h, *get_nv12_info(cam_w, cam_h))
+        frame_prepare = make_frame_prepare(nv12, self.model_w, self.model_h)
+        update_both_imgs = make_update_both_imgs(frame_prepare, self.model_w, self.model_h)
        self.jit_cache[key] = TinyJit(update_both_imgs, prune=True)

    if key not in self._nv12_cache:
@@ -116,7 +116,7 @@ class ModelCache:

 class ModelFetcher:
  """Handles fetching and caching of model data from remote source"""
-  MODEL_URL = "https://raw.githubusercontent.com/sunnypilot/sunnypilot-models/refs/heads/gh-pages/docs/driving_models_v16.json"
+  MODEL_URL = "https://raw.githubusercontent.com/sunnypilot/sunnypilot-models/refs/heads/gh-pages/docs/driving_models_v17.json"

  def __init__(self, params: Params):
    self.params = params
Author	SHA1	Message	Date
discountchubbs	85c1a79a14	torquey	2026-04-26 12:02:45 -07:00
discountchubbs	ced4a664cc	use upstreams	2026-04-26 11:18:51 -07:00
discountchubbs	03db277c22	dev	2026-04-26 11:15:34 -07:00
discountchubbs	11ed3800bf	ugh	2026-04-26 11:10:17 -07:00
discountchubbs	92526b878c	json v17	2026-04-26 11:07:17 -07:00
discountchubbs	66ff8ae52c	shebang	2026-04-26 00:33:45 -07:00
discountchubbs	d85cb76304	lint	2026-04-26 00:32:31 -07:00
James Vecellio-Grant	b4c613680e	Update SConscript	2026-04-26 00:29:30 -07:00
discountchubbs	f7511491f7	sync new modeld changes	2026-04-26 00:27:47 -07:00
discountchubbs	88b30e199b	fix to new tg	2026-04-25 23:47:26 -07:00
discountchubbs	2898f394dd	bump	2026-04-25 23:38:31 -07:00
discountchubbs	554cf9ca4a	modeld: sync tinygrad	2026-04-25 23:28:02 -07:00