bump old ref

Revert "Tinygrad runner (#34261 )"
This reverts commit 17ca6389
2026-06-21 23:22:07 +08:00 · 2025-01-05 08:51:31 -05:00 · 2025-01-05 08:48:27 -05:00 · 2025-01-05 05:14:18 -05:00 · 2025-01-05 10:47:28 +01:00 · 2025-01-05 02:27:44 -05:00
256 changed files with 9312 additions and 412 deletions
@@ -0,0 +1,2 @@
+Wen
+REGIST
@@ -86,7 +86,7 @@ jobs:
        run: >-
          sudo apt-get install -y imagemagick

-          scenes="homescreen settings_device settings_software settings_toggles settings_developer offroad_alert update_available prime onroad onroad_disengaged onroad_override onroad_sidebar onroad_wide onroad_wide_sidebar onroad_alert_small onroad_alert_mid onroad_alert_full driver_camera body keyboard"
+          scenes="homescreen settings_device settings_software settings_sunnylink settings_toggles settings_sunnypilot settings_sunnypilot_mads settings_developer offroad_alert update_available prime onroad onroad_disengaged onroad_override onroad_sidebar onroad_wide onroad_wide_sidebar onroad_alert_small onroad_alert_mid onroad_alert_full driver_camera body keyboard"
          A=($scenes)

          DIFF=""
@@ -74,6 +74,12 @@ AddOption('--minimal',
          default=os.path.exists(File('#.lfsconfig').abspath), # minimal by default on release branch (where there's no LFS)
          help='the minimum build to run openpilot. no tests, tools, etc.')

+AddOption('--stock-ui',
+          action='store_true',
+          dest='stock_ui',
+          default=False,
+          help='Build stock openpilot UI instead of sunnypilot UI')
+
 ## Architecture name breakdown (arch)
 ## - larch64: linux tici aarch64
 ## - aarch64: linux pc aarch64
@@ -172,6 +178,10 @@ else:
 if arch != "Darwin":
  ldflags += ["-Wl,--as-needed", "-Wl,--no-undefined"]

+if not GetOption('stock_ui'):
+  cflags += ["-DSUNNYPILOT"]
+  cxxflags += ["-DSUNNYPILOT"]
+
 ccflags_option = GetOption('ccflags')
 if ccflags_option:
  ccflags += ccflags_option.split(' ')
@@ -8,10 +8,72 @@ $Cxx.namespace("cereal");
 # cereal, so use these if you want custom events in your fork.

 # you can rename the struct, but don't change the identifier
-struct CustomReserved0 @0x81c2f05a394cf4af {
+struct SelfdriveStateSP @0x81c2f05a394cf4af {
+  mads @0 :ModularAssistiveDrivingSystem;
+
+  struct ModularAssistiveDrivingSystem {
+    state @0 :ModularAssistiveDrivingSystemState;
+    enabled @1 :Bool;
+    active @2 :Bool;
+    available @3 :Bool;
+
+    enum ModularAssistiveDrivingSystemState {
+      disabled @0;
+      paused @1;
+      enabled @2;
+      softDisabling @3;
+      overriding @4;
+    }
+  }
 }

-struct CustomReserved1 @0xaedffd8f31e7b55d {
+struct ModelManagerSP @0xaedffd8f31e7b55d {
+  activeBundle @0 :ModelBundle;
+  selectedBundle @1 :ModelBundle;
+  availableBundles @2 :List(ModelBundle);
+
+  struct DownloadUri {
+    uri @0 :Text;
+    sha256 @1 :Text;
+  }
+
+  enum Type {
+    drive @0;
+    navigation @1;
+    metadata @2;
+  }
+
+  struct Model {
+    fullName @0 :Text;
+    fileName @1 :Text;
+    downloadUri @2 :DownloadUri;
+    downloadProgress @3 :DownloadProgress;
+    type @4 :Type;
+  }
+
+  enum DownloadStatus {
+    notDownloading @0;
+    downloading @1;
+    downloaded @2;
+    cached @3;
+    failed @4;
+  }
+
+  struct DownloadProgress {
+    status @0 :DownloadStatus;
+    progress @1 :Float32;
+    eta @2 :UInt32;
+  }
+
+  struct ModelBundle {
+    index @0 :UInt32;
+    internalName @1 :Text;
+    displayName @2 :Text;
+    models @3 :List(Model);
+    status @4 :DownloadStatus;
+    generation @5 :UInt32;
+    environment @6 :Text;
+  }
 }

 struct CustomReserved2 @0xf35cc4560bbf6ec2 {
@@ -125,6 +125,79 @@ struct OnroadEvent @0xc4fa6047f024e718 {
    espActive @90;
    personalityChanged @91;
    aeb @92;
+    eventReserved93 @93;
+    eventReserved94 @94;
+    eventReserved95 @95;
+    eventReserved96 @96;
+    eventReserved97 @97;
+    eventReserved98 @98;
+    eventReserved99 @99;
+    eventReserved100 @100;
+    eventReserved101 @101;
+    eventReserved102 @102;
+    eventReserved103 @103;
+    eventReserved104 @104;
+    eventReserved105 @105;
+    eventReserved106 @106;
+    eventReserved107 @107;
+    eventReserved108 @108;
+    eventReserved109 @109;
+    eventReserved110 @110;
+    eventReserved111 @111;
+    eventReserved112 @112;
+    eventReserved113 @113;
+    eventReserved114 @114;
+    eventReserved115 @115;
+    eventReserved116 @116;
+    eventReserved117 @117;
+    eventReserved118 @118;
+    eventReserved119 @119;
+    eventReserved120 @120;
+    eventReserved121 @121;
+    eventReserved122 @122;
+    eventReserved123 @123;
+    eventReserved124 @124;
+    eventReserved125 @125;
+    eventReserved126 @126;
+    eventReserved127 @127;
+    eventReserved128 @128;
+    eventReserved129 @129;
+    eventReserved130 @130;
+    eventReserved131 @131;
+    eventReserved132 @132;
+    eventReserved133 @133;
+    eventReserved134 @134;
+    eventReserved135 @135;
+    eventReserved136 @136;
+    eventReserved137 @137;
+    eventReserved138 @138;
+    eventReserved139 @139;
+    eventReserved140 @140;
+    eventReserved141 @141;
+    eventReserved142 @142;
+    eventReserved143 @143;
+    eventReserved144 @144;
+    eventReserved145 @145;
+    eventReserved146 @146;
+    eventReserved147 @147;
+    eventReserved148 @148;
+    eventReserved149 @149;
+    eventReserved150 @150;
+
+    # sunnypilot
+    lkasEnable @151;
+    lkasDisable @152;
+    manualSteeringRequired @153;
+    manualLongitudinalRequired @154;
+    silentLkasEnable @155;
+    silentLkasDisable @156;
+    silentBrakeHold @157;
+    silentWrongGear @158;
+    silentReverseGear @159;
+    silentDoorOpen @160;
+    silentSeatbeltNotLatched @161;
+    silentParkBrake @162;
+    controlsMismatchLateral @163;

    soundsUnavailableDEPRECATED @47;
  }
@@ -589,6 +662,7 @@ struct PandaState @0xa7649e2575e4591e {

  # safety stuff
  controlsAllowed @3 :Bool;
+  controlsAllowedLat @5 :Bool;
  safetyRxInvalid @19 :UInt32;
  safetyTxBlocked @24 :UInt32;
  safetyModel @14 :Car.CarParams.SafetyModel;
@@ -696,7 +770,6 @@ struct PandaState @0xa7649e2575e4591e {
  }

  gasInterceptorDetectedDEPRECATED @4 :Bool;
-  startedSignalDetectedDEPRECATED @5 :Bool;
  hasGpsDEPRECATED @6 :Bool;
  gmlanSendErrsDEPRECATED @9 :UInt32;
  fanSpeedRpmDEPRECATED @11 :UInt16;
@@ -2558,8 +2631,8 @@ struct Event {
    customReservedRawData2 @126 :Data;

    # *********** Custom: reserved for forks ***********
-    customReserved0 @107 :Custom.CustomReserved0;
-    customReserved1 @108 :Custom.CustomReserved1;
+    selfdriveStateSP @107 :Custom.SelfdriveStateSP;
+    modelManagerSP @108 :Custom.ModelManagerSP;
    customReserved2 @109 :Custom.CustomReserved2;
    customReserved3 @110 :Custom.CustomReserved3;
    customReserved4 @111 :Custom.CustomReserved4;
@@ -74,6 +74,10 @@ _services: dict[str, tuple] = {
  "userFlag": (True, 0., 1),
  "microphone": (True, 10., 10),

+  # sunnypilot
+  "modelManagerSP": (False, 1., 1),
+  "selfdriveStateSP": (True, 100., 10),
+
  # debug
  "uiDebug": (True, 0., 1),
  "testJoystick": (True, 0.),
@@ -1,46 +1,27 @@
-import jwt
-import os
-import requests
-from datetime import datetime, timedelta, UTC
-from openpilot.system.hardware.hw import Paths
-from openpilot.system.version import get_version
+from openpilot.common.api.comma_connect import CommaConnectApi
+from sunnypilot.sunnylink.api import SunnylinkApi

-API_HOST = os.getenv('API_HOST', 'https://api.commadotai.com')

 class Api:
-  def __init__(self, dongle_id):
-    self.dongle_id = dongle_id
-    with open(Paths.persist_root()+'/comma/id_rsa') as f:
-      self.private_key = f.read()
+  def __init__(self, dongle_id, use_sunnylink=False):
+    if use_sunnylink:
+      self.service = SunnylinkApi(dongle_id)
+    else:
+      self.service = CommaConnectApi(dongle_id)
+
+  def request(self, method, endpoint, **params):
+    return self.service.request(method, endpoint, **params)

  def get(self, *args, **kwargs):
-    return self.request('GET', *args, **kwargs)
+    return self.service.get(*args, **kwargs)

  def post(self, *args, **kwargs):
-    return self.request('POST', *args, **kwargs)
-
-  def request(self, method, endpoint, timeout=None, access_token=None, **params):
-    return api_get(endpoint, method=method, timeout=timeout, access_token=access_token, **params)
+    return self.service.post(*args, **kwargs)

  def get_token(self, expiry_hours=1):
-    now = datetime.now(UTC).replace(tzinfo=None)
-    payload = {
-      'identity': self.dongle_id,
-      'nbf': now,
-      'iat': now,
-      'exp': now + timedelta(hours=expiry_hours)
-    }
-    token = jwt.encode(payload, self.private_key, algorithm='RS256')
-    if isinstance(token, bytes):
-      token = token.decode('utf8')
-    return token
+    return self.service.get_token(expiry_hours)


-def api_get(endpoint, method='GET', timeout=None, access_token=None, **params):
-  headers = {}
-  if access_token is not None:
-    headers['Authorization'] = "JWT " + access_token
-
-  headers['User-Agent'] = "openpilot-" + get_version()
-
-  return requests.request(method, API_HOST + "/" + endpoint, timeout=timeout, headers=headers, params=params)
+def api_get(endpoint, method='GET', timeout=None, access_token=None, use_sunnylink=False, **params):
+  return SunnylinkApi(None).api_get(endpoint, method, timeout, access_token, **params) if use_sunnylink \
+    else CommaConnectApi(None).api_get(endpoint, method, timeout, access_token, **params)
@@ -0,0 +1,56 @@
+import jwt
+import requests
+import unicodedata
+from datetime import datetime, timedelta, UTC
+from openpilot.system.hardware.hw import Paths
+from openpilot.system.version import get_version
+
+
+class BaseApi:
+  def __init__(self, dongle_id, api_host, user_agent="openpilot-"):
+    self.dongle_id = dongle_id
+    self.api_host = api_host
+    self.user_agent = user_agent
+    with open(f'{Paths.persist_root()}/comma/id_rsa') as f:
+      self.private_key = f.read()
+
+  def get(self, *args, **kwargs):
+    return self.request('GET', *args, **kwargs)
+
+  def post(self, *args, **kwargs):
+    return self.request('POST', *args, **kwargs)
+
+  def request(self, method, endpoint, timeout=None, access_token=None, **params):
+    return self.api_get(endpoint, method=method, timeout=timeout, access_token=access_token, **params)
+
+  def _get_token(self, expiry_hours=1, **extra_payload):
+    now = datetime.now(UTC).replace(tzinfo=None)
+    payload = {
+      'identity': self.dongle_id,
+      'nbf': now,
+      'iat': now,
+      'exp': now + timedelta(hours=expiry_hours),
+      **extra_payload
+    }
+    token = jwt.encode(payload, self.private_key, algorithm='RS256')
+    if isinstance(token, bytes):
+      token = token.decode('utf8')
+    return token
+
+  def get_token(self, expiry_hours=1):
+    return self._get_token(expiry_hours)
+
+  def remove_non_ascii_chars(self, text):
+    normalized_text = unicodedata.normalize('NFD', text)
+    ascii_encoded_text = normalized_text.encode('ascii', 'ignore')
+    return ascii_encoded_text.decode()
+
+  def api_get(self, endpoint, method='GET', timeout=None, access_token=None, **params):
+    headers = {}
+    if access_token is not None:
+      headers['Authorization'] = "JWT " + access_token
+
+    version = self.remove_non_ascii_chars(get_version())
+    headers['User-Agent'] = self.user_agent + version
+
+    return requests.request(method, f"{self.api_host}/{endpoint}", timeout=timeout, headers=headers, params=params)
@@ -0,0 +1,11 @@
+import os
+
+from openpilot.common.api.base import BaseApi
+
+API_HOST = os.getenv('API_HOST', 'https://api.commadotai.com')
+
+
+class CommaConnectApi(BaseApi):
+  def __init__(self, dongle_id):
+    super().__init__(dongle_id, API_HOST)
+    self.user_agent = "openpilot-"
@@ -109,36 +109,36 @@ std::unordered_map<std::string, uint32_t> keys = {
    {"CurrentBootlog", PERSISTENT},
    {"CurrentRoute", CLEAR_ON_MANAGER_START | CLEAR_ON_ONROAD_TRANSITION},
    {"DisableLogging", CLEAR_ON_MANAGER_START | CLEAR_ON_ONROAD_TRANSITION},
-    {"DisablePowerDown", PERSISTENT},
-    {"DisableUpdates", PERSISTENT},
-    {"DisengageOnAccelerator", PERSISTENT},
+    {"DisablePowerDown", PERSISTENT | BACKUP},
+    {"DisableUpdates", PERSISTENT | BACKUP},
+    {"DisengageOnAccelerator", PERSISTENT | BACKUP},
    {"DongleId", PERSISTENT},
    {"DoReboot", CLEAR_ON_MANAGER_START},
    {"DoShutdown", CLEAR_ON_MANAGER_START},
    {"DoUninstall", CLEAR_ON_MANAGER_START},
-    {"ExperimentalLongitudinalEnabled", PERSISTENT | DEVELOPMENT_ONLY},
-    {"ExperimentalMode", PERSISTENT},
-    {"ExperimentalModeConfirmed", PERSISTENT},
+    {"ExperimentalLongitudinalEnabled", PERSISTENT | DEVELOPMENT_ONLY | BACKUP},
+    {"ExperimentalMode", PERSISTENT | BACKUP},
+    {"ExperimentalModeConfirmed", PERSISTENT | BACKUP},
    {"FirmwareQueryDone", CLEAR_ON_MANAGER_START | CLEAR_ON_ONROAD_TRANSITION},
    {"ForcePowerDown", PERSISTENT},
    {"GitBranch", PERSISTENT},
    {"GitCommit", PERSISTENT},
    {"GitCommitDate", PERSISTENT},
    {"GitDiff", PERSISTENT},
-    {"GithubSshKeys", PERSISTENT},
-    {"GithubUsername", PERSISTENT},
+    {"GithubSshKeys", PERSISTENT | BACKUP},
+    {"GithubUsername", PERSISTENT | BACKUP},
    {"GitRemote", PERSISTENT},
-    {"GsmApn", PERSISTENT},
-    {"GsmMetered", PERSISTENT},
-    {"GsmRoaming", PERSISTENT},
+    {"GsmApn", PERSISTENT | BACKUP},
+    {"GsmMetered", PERSISTENT | BACKUP},
+    {"GsmRoaming", PERSISTENT | BACKUP},
    {"HardwareSerial", PERSISTENT},
    {"HasAcceptedTerms", PERSISTENT},
    {"IMEI", PERSISTENT},
    {"InstallDate", PERSISTENT},
    {"IsDriverViewEnabled", CLEAR_ON_MANAGER_START},
    {"IsEngaged", PERSISTENT},
-    {"IsLdwEnabled", PERSISTENT},
-    {"IsMetric", PERSISTENT},
+    {"IsLdwEnabled", PERSISTENT | BACKUP},
+    {"IsMetric", PERSISTENT | BACKUP},
    {"IsOffroad", CLEAR_ON_MANAGER_START},
    {"IsOnroad", PERSISTENT},
    {"IsRhdDetected", PERSISTENT},
@@ -146,7 +146,7 @@ std::unordered_map<std::string, uint32_t> keys = {
    {"IsTakingSnapshot", CLEAR_ON_MANAGER_START},
    {"IsTestedBranch", CLEAR_ON_MANAGER_START},
    {"JoystickDebugMode", CLEAR_ON_MANAGER_START | CLEAR_ON_OFFROAD_TRANSITION},
-    {"LanguageSetting", PERSISTENT},
+    {"LanguageSetting", PERSISTENT | BACKUP},
    {"LastAthenaPingTime", CLEAR_ON_MANAGER_START},
    {"LastGPSPosition", PERSISTENT},
    {"LastManagerExitReason", CLEAR_ON_MANAGER_START},
@@ -158,7 +158,7 @@ std::unordered_map<std::string, uint32_t> keys = {
    {"LiveTorqueParameters", PERSISTENT | DONT_LOG},
    {"LocationFilterInitialState", PERSISTENT},
    {"LongitudinalManeuverMode", CLEAR_ON_MANAGER_START | CLEAR_ON_OFFROAD_TRANSITION},
-    {"LongitudinalPersonality", PERSISTENT},
+    {"LongitudinalPersonality", PERSISTENT | BACKUP},
    {"NetworkMetered", PERSISTENT},
    {"ObdMultiplexingChanged", CLEAR_ON_MANAGER_START | CLEAR_ON_ONROAD_TRANSITION},
    {"ObdMultiplexingEnabled", CLEAR_ON_MANAGER_START | CLEAR_ON_ONROAD_TRANSITION},
@@ -174,17 +174,17 @@ std::unordered_map<std::string, uint32_t> keys = {
    {"Offroad_TemperatureTooHigh", CLEAR_ON_MANAGER_START},
    {"Offroad_UnofficialHardware", CLEAR_ON_MANAGER_START},
    {"Offroad_UpdateFailed", CLEAR_ON_MANAGER_START},
-    {"OpenpilotEnabledToggle", PERSISTENT},
+    {"OpenpilotEnabledToggle", PERSISTENT | BACKUP},
    {"PandaHeartbeatLost", CLEAR_ON_MANAGER_START | CLEAR_ON_OFFROAD_TRANSITION},
    {"PandaSomResetTriggered", CLEAR_ON_MANAGER_START | CLEAR_ON_OFFROAD_TRANSITION},
    {"PandaSignatures", CLEAR_ON_MANAGER_START},
    {"PrimeType", PERSISTENT},
-    {"RecordFront", PERSISTENT},
+    {"RecordFront", PERSISTENT | BACKUP},
    {"RecordFrontLock", PERSISTENT},  // for the internal fleet
-    {"SecOCKey", PERSISTENT | DONT_LOG},
+    {"SecOCKey", PERSISTENT | DONT_LOG},  // Candidate for | BACKUP
    {"RouteCount", PERSISTENT},
    {"SnoozeUpdate", CLEAR_ON_MANAGER_START | CLEAR_ON_OFFROAD_TRANSITION},
-    {"SshEnabled", PERSISTENT},
+    {"SshEnabled", PERSISTENT | BACKUP},
    {"TermsVersion", PERSISTENT},
    {"TrainingVersion", PERSISTENT},
    {"UbloxAvailable", PERSISTENT},
@@ -200,7 +200,28 @@ std::unordered_map<std::string, uint32_t> keys = {
    {"UpdaterTargetBranch", CLEAR_ON_MANAGER_START},
    {"UpdaterLastFetchTime", PERSISTENT},
    {"Version", PERSISTENT},
-    {"EnableGithubRunner", PERSISTENT},
+
+    // --- sunnypilot params --- //
+    {"EnableGithubRunner", PERSISTENT | BACKUP},
+
+    // MADS params
+    {"Mads", PERSISTENT | BACKUP},
+    {"MadsMainCruiseAllowed", PERSISTENT | BACKUP},
+    {"MadsPauseLateralOnBrake", PERSISTENT | BACKUP},
+    {"MadsUnifiedEngagementMode", PERSISTENT | BACKUP},
+
+    // Model Manager params
+    {"ModelManager_ActiveBundle", PERSISTENT},
+    {"ModelManager_DownloadIndex", CLEAR_ON_MANAGER_START | CLEAR_ON_OFFROAD_TRANSITION | CLEAR_ON_ONROAD_TRANSITION},
+    {"ModelManager_LastSyncTime", CLEAR_ON_MANAGER_START | CLEAR_ON_OFFROAD_TRANSITION},
+    {"ModelManager_ModelsCache", PERSISTENT | BACKUP},
+
+    // sunnylink params
+    {"EnableSunnylinkUploader", PERSISTENT | BACKUP},
+    {"LastSunnylinkPingTime", CLEAR_ON_MANAGER_START},
+    {"SunnylinkDongleId", PERSISTENT},
+    {"SunnylinkdPid", PERSISTENT},
+    {"SunnylinkEnabled", PERSISTENT},
 };

 } // namespace
@@ -16,6 +16,7 @@ enum ParamKeyType {
  CLEAR_ON_OFFROAD_TRANSITION = 0x10,
  DONT_LOG = 0x20,
  DEVELOPMENT_ONLY = 0x40,
+  BACKUP = 0x80,
  ALL = 0xFFFFFFFF
 };

@@ -42,7 +42,8 @@ dependencies = [

  # modeld
  "onnx >= 1.14.0",
-  "onnxruntime >=1.16.3",
+  "onnxruntime >=1.16.3; platform_system == 'Linux' and platform_machine == 'aarch64'",
+  "onnxruntime-gpu >=1.16.3; platform_system == 'Linux' and platform_machine == 'x86_64'",

  # logging
  "pyzmq",
@@ -163,6 +164,7 @@ testpaths = [
  "tools/replay",
  "tools/cabana",
  "cereal/messaging/tests",
+  "sunnypilot",
 ]

 [tool.codespell]
@@ -0,0 +1,361 @@
+#!/usr/bin/env python3
+import argparse
+import subprocess
+import sys
+import shutil
+import signal
+import contextlib
+import tempfile
+import os
+
+
+def run_command(command: str) -> tuple[int, str, str]:
+  """Run a shell command and return exit code, stdout, and stderr."""
+  process = subprocess.Popen(
+    command,
+    shell=True,
+    stdout=subprocess.PIPE,
+    stderr=subprocess.PIPE,
+    text=True
+  )
+  stdout, stderr = process.communicate()
+  return process.returncode, stdout.strip(), stderr.strip()
+
+
+def is_gh_available() -> bool:
+  """Check if GitHub CLI is available."""
+  return shutil.which('gh') is not None
+
+
+def get_current_branch() -> str | None:
+  """Get the name of the current git branch."""
+  code, output, error = run_command("git rev-parse --abbrev-ref HEAD")
+  if code != 0:
+    print(f"Error getting current branch: {error}")
+    return None
+  return output
+
+
+def backup_branch(branch_name: str) -> bool:
+  """Create a backup of the current branch."""
+  backup_name = f"{branch_name}-backup-$(date +%Y%m%d_%H%M%S)"
+  code, _, error = run_command(f"git branch {backup_name}")
+  if code != 0:
+    print(f"Error creating backup branch: {error}")
+    return False
+  print(f"Created backup branch: {backup_name}")
+  return True
+
+
+def get_commit_messages(source_branch: str, target_branch: str) -> list[str] | None:
+  """Get all commit messages between source and target branches."""
+  code, output, error = run_command(f"git log {target_branch}..{source_branch} --format=%B")
+  if code != 0:
+    print(f"Error getting commit messages: {error}")
+    return None
+  return [msg.strip() for msg in output.splitlines() if msg and not msg.startswith('Merge')]
+
+
+def get_pr_info(branch_name: str) -> str | None:
+  """Get PR title using GitHub CLI."""
+  if not is_gh_available():
+    print("Warning: GitHub CLI not found. Install it to auto-fetch PR titles:")
+    print("  https://cli.github.com/")
+    return None
+
+  # Try to get PR info using gh cli
+  code, output, error = run_command(f"gh pr view --json title --jq .title {branch_name}")
+  if code != 0:
+    print(f"No open PR found for branch '{branch_name}'")
+    return None
+
+  return output
+
+
+def create_squash_message(pr_title: str | None, commit_messages: list[str], source_branch: str) -> str:
+  """Create a squash commit message from PR title and commit messages."""
+  parts = []
+
+  # Add PR title if provided
+  if pr_title:
+    parts.append(pr_title)
+  else:
+    parts.append(f"Squashed changes from {source_branch}")
+  parts.append("")  # Empty line after title
+
+  # Add original commits section
+  if commit_messages:
+    parts.append("Original commits:")
+    parts.append("")  # Empty line before list
+    parts.extend(f"* {msg}" for msg in commit_messages)
+
+  return '\n'.join(parts)
+
+
+def prompt_for_title() -> str:
+  """Prompt user for a commit title."""
+  return input("Enter commit title (or press Enter to use default): ").strip()
+
+
+@contextlib.contextmanager
+def workspace_manager(original_branch: str):
+  """Context manager to handle workspace state and cleanup."""
+  stash_created = False
+  stash_restored = False
+  temp_branch: str | None = None
+
+  def cleanup_handler(signum=None, frame=None):
+    """Clean up workspace state."""
+    nonlocal temp_branch, stash_created, stash_restored
+    try:
+      if signum and stash_restored:
+        # If we're handling Ctrl+C but stash was already restored,
+        # just clean up branches and exit
+        current = get_current_branch()
+        if current and current != original_branch:
+          run_command(f"git checkout {original_branch}")
+        if temp_branch:
+          run_command(f"git branch -D {temp_branch}")
+        print("\nOperation interrupted, but changes were already restored.")
+        sys.exit(1)
+
+      # First, switch back to original branch
+      current = get_current_branch()
+      if current and current != original_branch:
+        run_command(f"git checkout {original_branch}")
+
+      # Then clean up temp branch
+      if temp_branch:
+        run_command(f"git branch -D {temp_branch}")
+
+      # Finally, restore stash if needed - AFTER switching branches
+      if stash_created and not stash_restored:
+        print("Restoring your uncommitted changes...")
+        code, stash_list, _ = run_command("git stash list")
+        if code == 0 and "Automatic stash by squash script" in stash_list:
+          run_command("git stash pop")
+          stash_restored = True
+          stash_created = False
+
+      if signum:
+        print("\nOperation interrupted. Cleaned up and restored original state.")
+        sys.exit(1)
+
+    except Exception as e:
+      print(f"Error during cleanup: {e}")
+      if signum:
+        sys.exit(1)
+
+  try:
+    # Set up signal handlers
+    signal.signal(signal.SIGINT, cleanup_handler)
+    signal.signal(signal.SIGTERM, cleanup_handler)
+
+    # Check for changes (including untracked files)
+    code, output, _ = run_command("git status --porcelain")
+    if output:
+      print("Stashing uncommitted changes...")
+      run_command("git stash push -u -m 'Automatic stash by squash script'")
+      stash_created = True
+
+    yield lambda x: setattr(x, 'temp_branch', temp_branch)
+
+  except Exception as e:
+    print(f"\nError occurred: {str(e)}")
+    cleanup_handler()
+    raise
+  finally:
+    cleanup_handler()
+
+
+def create_commit_with_message(message: str) -> bool:
+  """Create a commit with the given message using a temporary file."""
+  try:
+    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
+      f.write(message)
+      temp_path = f.name
+
+    # Use the temporary file for the commit message
+    code, _, error = run_command(f"git commit -F {temp_path}")
+    os.unlink(temp_path)  # Clean up the temp file
+
+    if code != 0:
+      print(f"Error creating commit: {error}")
+      return False
+    return True
+  except Exception as e:
+    print(f"Error handling commit message: {e}")
+    if os.path.exists(temp_path):
+      os.unlink(temp_path)
+    return False
+
+
+def squash_and_merge(source_branch: str, target_branch: str, manual_title: str | None, backup: bool = False, push: bool = False) -> bool:
+  """
+  Squash the source branch and merge into target branch.
+  """
+  # Get original branch right away
+  original_branch = get_current_branch()
+  if not original_branch:
+    return False
+
+  class State:
+    temp_branch: str | None = None
+
+  state = State()
+
+  with workspace_manager(original_branch) as set_temp_branch:
+    # Validate source branch exists
+    code, _, error = run_command(f"git rev-parse --verify {source_branch}")
+    if code != 0:
+      print(f"Error: Source branch {source_branch} not found")
+      return False
+
+    if source_branch == target_branch:
+      print(f"Error: Source and target branches cannot be the same ({source_branch})")
+      return False
+
+    # Ensure target branch exists
+    code, _, error = run_command(f"git rev-parse --verify {target_branch}")
+    if code != 0:
+      print(f"Error: Target branch {target_branch} not found")
+      return False
+
+    # Find merge base
+    code, merge_base, error = run_command(f"git merge-base {target_branch} {source_branch}")
+    if code != 0:
+      print(f"Error finding merge base: {error}")
+      return False
+
+    # Create backup unless explicitly skipped
+    if backup and not backup_branch(source_branch):
+      return False
+
+    # Get commit messages
+    commit_messages = get_commit_messages(source_branch, target_branch)
+    if commit_messages is None:
+      return False
+
+    # Get title (priority: manual title > PR title > prompt user)
+    title = manual_title
+    if not title:
+      title = get_pr_info(source_branch)
+    if not title:
+      title = prompt_for_title()
+
+    try:
+      # Create and switch to temporary branch
+      temp_branch = f"temp-squash-{source_branch}"
+      state.temp_branch = temp_branch
+      set_temp_branch(state)
+
+      print(f"\nCreating temporary branch {temp_branch}...")
+      code, _, error = run_command(f"git checkout -b {temp_branch} {source_branch}")
+      if code != 0:
+        print(f"Error creating temp branch: {error}")
+        return False
+
+      print("Preparing squash by resetting temporary branch to merge base...")
+      code, _, error = run_command(f"git reset --soft {merge_base}")
+      if code != 0:
+        print(f"Error resetting for squash: {error}")
+        return False
+
+      # Create commit with message
+      print("Creating squash commit...")
+      squash_message = create_squash_message(title, commit_messages, source_branch)
+      if not create_commit_with_message(squash_message):
+        return False
+
+      # Switch to target and try merge
+      print(f"\nSwitching to target branch {target_branch}...")
+      code, _, error = run_command(f"git checkout {target_branch}")
+      if code != 0:
+        print(f"Error checking out target branch: {error}")
+        return False
+
+      print(f"Attempting to merge changes from {temp_branch}...")
+      code, _, error = run_command(f"git merge {temp_branch}")
+
+      if code != 0:
+        print(f"\nMerge failed with error: {error}")
+        print("\nThe squash was successful, and your changes are preserved in the temporary branch.")
+        print("To complete the merge manually, follow these steps:")
+        print(f"\n1. Your squashed changes are in branch: '{temp_branch}'")
+        print(f"2. The target branch is: '{target_branch}'")
+        print("\nTo resolve the conflicts:")
+        print(f"   git checkout {target_branch}")
+        print(f"   git merge {temp_branch}")
+        print("   # resolve conflicts in your editor")
+        print("   git add <resolved-files>")
+        print("   git commit")
+        print(f"   git push origin {target_branch}  # when ready to push")
+        print("\nTo clean up after successful merge:")
+        print(f"   git branch -D {temp_branch}")
+
+        # Make sure to abort the merge
+        print("\nAborting current merge attempt...")
+        run_command("git merge --abort")
+
+        # Return to original branch, but keep temp branch
+        print(f"Returning to {original_branch}...")
+        run_command(f"git checkout {original_branch}")
+        return False
+
+      # Clean up temp branch on success
+      run_command(f"git branch -D {temp_branch}")
+
+      # Push if requested
+      if push:
+        code, _, error = run_command(f"git push origin {target_branch}")
+        if code != 0:
+          print(f"Error pushing to {target_branch}: {error}")
+          return False
+        print(f"Successfully pushed to {target_branch}")
+      else:
+        print(f"Changes squashed and merged into {target_branch} locally")
+        print(f"To push the changes: git push origin {target_branch}")
+
+      # Return to original branch
+      code, _, error = run_command(f"git checkout {original_branch}")
+      if code != 0:
+        print(f"Warning: Failed to return to original branch: {error}")
+        return False
+
+      return True
+
+    except Exception as e:
+      print(f"Error during squash process: {e}")
+      return False
+
+
+def main():
+  parser = argparse.ArgumentParser(
+    description='Squash branch and merge into target branch'
+  )
+  parser.add_argument('--target', '-t', required=True,
+                      help='Target branch to merge changes into')
+  parser.add_argument('--source', '-s',
+                      help='Source branch to squash (default: current branch)')
+  parser.add_argument('--title', '-m',
+                      help='Optional manual title (overrides PR title)')
+  parser.add_argument('--backup', action='store_true',
+                      help='Creates a backup branch for the source branch')
+  parser.add_argument('--push', action='store_true',
+                      help='Push changes to remote after squashing')
+
+  args = parser.parse_args()
+
+  # Determine source branch early
+  source_branch = args.source
+  if not source_branch:
+    source_branch = get_current_branch()
+    if not source_branch:
+      sys.exit(1)
+
+  if not squash_and_merge(source_branch, args.target, args.title, args.backup, args.push):
+    sys.exit(1)
+
+
+if __name__ == "__main__":
+  main()
@@ -91,7 +91,7 @@ whitelist = [
  "tools/joystick/",
  "tools/longitudinal_maneuvers/",

-  "tinygrad_repo/examples/openpilot/compile3.py",
+  "tinygrad_repo/openpilot/compile2.py",
  "tinygrad_repo/extra/onnx.py",
  "tinygrad_repo/extra/onnx_ops.py",
  "tinygrad_repo/extra/thneed.py",
@@ -57,7 +57,7 @@ function run_tests() {

  if [[ -z "$FAST" ]]; then
    run "mypy" mypy $PYTHON_FILES
-    run "codespell" codespell $ALL_FILES
+    run "codespell" codespell $ALL_FILES --ignore-words=$ROOT/.codespellignore
  fi

  return $FAILED
@@ -22,6 +22,8 @@ from openpilot.selfdrive.pandad import can_capnp_to_list, can_list_to_can_capnp
 from openpilot.selfdrive.car.cruise import VCruiseHelper
 from openpilot.selfdrive.car.car_specific import MockCarState

+from openpilot.sunnypilot.mads.mads import MadsParams
+
 REPLAY = "REPLAY" in os.environ

 EventName = log.OnroadEvent.EventName
@@ -113,6 +115,10 @@ class Car:
    if not disengage_on_accelerator:
      self.CP.alternativeExperience |= ALTERNATIVE_EXPERIENCE.DISABLE_DISENGAGE_ON_GAS

+    # mads
+    MadsParams().set_alternative_experience(self.CP)
+    MadsParams().set_car_specific_params(self.CP)
+
    openpilot_enabled_toggle = self.params.get_bool("OpenpilotEnabledToggle")

    controller_available = self.CI.CC is not None and openpilot_enabled_toggle and not self.CP.dashcamOnly
@@ -19,6 +19,7 @@ from openpilot.selfdrive.controls.lib.longcontrol import LongControl
 from openpilot.selfdrive.controls.lib.vehicle_model import VehicleModel
 from openpilot.selfdrive.locationd.helpers import PoseCalibrator, Pose

+from opendbc.sunnypilot import SunnypilotParamFlags

 State = log.SelfdriveState.OpenpilotState
 LaneChangeState = log.LaneChangeState
@@ -56,6 +57,9 @@ class Controls:
    elif self.CP.lateralTuning.which() == 'torque':
      self.LaC = LatControlTorque(self.CP, self.CI)

+    data_services = list(self.sm.data.keys()) + ['selfdriveStateSP']
+    self.sm = messaging.SubMaster(data_services, poll='selfdriveState')
+
  def update(self):
    self.sm.update(15)
    if self.sm.updated["liveCalibration"]:
@@ -88,7 +92,16 @@ class Controls:

    # Check which actuators can be enabled
    standstill = abs(CS.vEgo) <= max(self.CP.minSteerSpeed, MIN_LATERAL_CONTROL_SPEED) or CS.standstill
-    CC.latActive = self.sm['selfdriveState'].active and not CS.steerFaultTemporary and not CS.steerFaultPermanent and not standstill
+
+    ss_sp = self.sm['selfdriveStateSP']
+    CC.madsEnabled = ss_sp.mads.enabled
+    if ss_sp.mads.available:
+      CC.sunnypilotParams |= SunnypilotParamFlags.ENABLE_MADS.value
+      _lat_active = ss_sp.mads.active
+    else:
+      _lat_active = self.sm['selfdriveState'].active
+
+    CC.latActive = _lat_active and not CS.steerFaultTemporary and not CS.steerFaultPermanent and not standstill
    CC.longActive = CC.enabled and not any(e.overrideLongitudinal for e in self.sm['onroadEvents']) and self.CP.openpilotLongitudinalControl

    actuators = CC.actuators
@@ -13,6 +13,20 @@ common_src = [
  "transforms/transform.cc",
 ]

+thneed_src_common = [
+  "thneed/thneed_common.cc",
+  "thneed/serialize.cc",
+]
+
+thneed_src_qcom = thneed_src_common + ["thneed/thneed_qcom2.cc"]
+thneed_src_pc = thneed_src_common + ["thneed/thneed_pc.cc"]
+thneed_src = thneed_src_qcom if arch == "larch64" else thneed_src_pc
+
+# SNPE except on Mac and ARM Linux
+snpe_lib = []
+if arch != "Darwin" and arch != "aarch64":
+  common_src += ['runners/snpemodel.cc']
+  snpe_lib += ['SNPE']

 # OpenCL is a framework on Mac
 if arch == "Darwin":
@@ -31,24 +45,34 @@ snpe_rpath_pc = f"{Dir('#').abspath}/third_party/snpe/x86_64-linux-clang"
 snpe_rpath = lenvCython['RPATH'] + [snpe_rpath_qcom if arch == "larch64" else snpe_rpath_pc]

 cython_libs = envCython["LIBS"] + libs
+snpemodel_lib = lenv.Library('snpemodel', ['runners/snpemodel.cc'])
 commonmodel_lib = lenv.Library('commonmodel', common_src)
+
+lenvCython.Program('runners/runmodel_pyx.so', 'runners/runmodel_pyx.pyx', LIBS=cython_libs, FRAMEWORKS=frameworks)
+lenvCython.Program('runners/snpemodel_pyx.so', 'runners/snpemodel_pyx.pyx', LIBS=[snpemodel_lib, snpe_lib, *cython_libs], FRAMEWORKS=frameworks, RPATH=snpe_rpath)
 lenvCython.Program('models/commonmodel_pyx.so', 'models/commonmodel_pyx.pyx', LIBS=[commonmodel_lib, *cython_libs], FRAMEWORKS=frameworks)
-tinygrad_files = ["#"+x for x in glob.glob(env.Dir("#tinygrad_repo").relpath + "/**", recursive=True, root_dir=env.Dir("#").abspath) if 'pycache' not in x]
+
+tinygrad_files = ["#"+x for x in glob.glob(env.Dir("#tinygrad_repo").relpath + "/**", recursive=True, root_dir=env.Dir("#").abspath)]

 # Get model metadata
 fn = File("models/supercombo").abspath
 cmd = f'python3 {Dir("#selfdrive/modeld").abspath}/get_model_metadata.py {fn}.onnx'
 lenv.Command(fn + "_metadata.pkl", [fn + ".onnx"] + tinygrad_files, cmd)

-# Compile tinygrad model
-pythonpath_string = 'PYTHONPATH="${PYTHONPATH}:' + env.Dir("#tinygrad_repo").abspath + '"'
-if arch == 'larch64':
-  device_string = 'QCOM=1'
-else:
-  device_string = 'CLANG=1 IMAGE=0'
+# Build thneed model
+if arch == "larch64" or GetOption('pc_thneed'):
+  tinygrad_opts = []
+  if not GetOption('pc_thneed'):
+    # use FLOAT16 on device for speed + don't cache the CL kernels for space
+    tinygrad_opts += ["FLOAT16=1", "PYOPENCL_NO_CACHE=1"]
+  cmd = f"cd {Dir('#').abspath}/tinygrad_repo && " + ' '.join(tinygrad_opts) + f" python3 openpilot/compile2.py {fn}.onnx {fn}.thneed"

-for model_name in ['supercombo', 'dmonitoring_model']:
-  fn = File(f"models/{model_name}").abspath
-  cmd = f'{pythonpath_string} {device_string} python3 {Dir("#tinygrad_repo").abspath}/examples/openpilot/compile3.py {fn}.onnx {fn}_tinygrad.pkl'
-  lenv.Command(fn + "_tinygrad.pkl", [fn + ".onnx"] + tinygrad_files, cmd)
+  lenv.Command(fn + ".thneed", [fn + ".onnx"] + tinygrad_files, cmd)

+  fn_dm = File("models/dmonitoring_model").abspath
+  cmd = f"cd {Dir('#').abspath}/tinygrad_repo && " + ' '.join(tinygrad_opts) + f" python3 openpilot/compile2.py {fn_dm}.onnx {fn_dm}.thneed"
+  lenv.Command(fn_dm + ".thneed", [fn_dm + ".onnx"] + tinygrad_files, cmd)
+
+  thneed_lib = env.SharedLibrary('thneed', thneed_src, LIBS=[gpucommon, common, 'OpenCL', 'dl'])
+  thneedmodel_lib = env.Library('thneedmodel', ['runners/thneedmodel.cc'])
+  lenvCython.Program('runners/thneedmodel_pyx.so', 'runners/thneedmodel_pyx.pyx', LIBS=envCython["LIBS"]+[thneedmodel_lib, thneed_lib, gpucommon, common, 'dl', 'OpenCL'])
@@ -1,4 +1,10 @@
 #!/usr/bin/env bash

 DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null && pwd)"
+cd "$DIR/../../"
+
+if [ -f "$DIR/libthneed.so" ]; then
+  export LD_PRELOAD="$DIR/libthneed.so"
+fi
+
 exec "$DIR/dmonitoringmodeld.py" "$@"
@@ -1,17 +1,8 @@
 #!/usr/bin/env python3
 import os
-from openpilot.system.hardware import TICI
-if TICI:
-  from tinygrad.tensor import Tensor
-  from tinygrad.dtype import dtypes
-  from openpilot.selfdrive.modeld.runners.tinygrad_helpers import qcom_tensor_from_opencl_address
-  os.environ['QCOM'] = '1'
-else:
-  from openpilot.selfdrive.modeld.runners.ort_helpers import make_onnx_cpu_runner
 import gc
 import math
 import time
-import pickle
 import ctypes
 import numpy as np
 from pathlib import Path
@@ -22,20 +13,21 @@ from cereal.messaging import PubMaster, SubMaster
 from msgq.visionipc import VisionIpcClient, VisionStreamType, VisionBuf
 from openpilot.common.swaglog import cloudlog
 from openpilot.common.realtime import set_realtime_priority
-from openpilot.common.transformations.model import dmonitoringmodel_intrinsics, DM_INPUT_SIZE
+from openpilot.common.transformations.model import dmonitoringmodel_intrinsics
 from openpilot.common.transformations.camera import _ar_ox_fisheye, _os_fisheye
 from openpilot.selfdrive.modeld.models.commonmodel_pyx import CLContext, MonitoringModelFrame
+from openpilot.selfdrive.modeld.runners import ModelRunner, Runtime
 from openpilot.selfdrive.modeld.parse_model_outputs import sigmoid

-MODEL_WIDTH, MODEL_HEIGHT = DM_INPUT_SIZE
 CALIB_LEN = 3
 FEATURE_LEN = 512
 OUTPUT_SIZE = 84 + FEATURE_LEN

 PROCESS_NAME = "selfdrive.modeld.dmonitoringmodeld"
 SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')
-MODEL_PATH = Path(__file__).parent / 'models/dmonitoring_model.onnx'
-MODEL_PKL_PATH = Path(__file__).parent / 'models/dmonitoring_model_tinygrad.pkl'
+MODEL_PATHS = {
+  ModelRunner.THNEED: Path(__file__).parent / 'models/dmonitoring_model.thneed',
+  ModelRunner.ONNX: Path(__file__).parent / 'models/dmonitoring_model.onnx'}

 class DriverStateResult(ctypes.Structure):
  _fields_ = [
@@ -66,42 +58,29 @@ class DMonitoringModelResult(ctypes.Structure):
 class ModelState:
  inputs: dict[str, np.ndarray]
  output: np.ndarray
+  model: ModelRunner

  def __init__(self, cl_ctx):
    assert ctypes.sizeof(DMonitoringModelResult) == OUTPUT_SIZE * ctypes.sizeof(ctypes.c_float)

    self.frame = MonitoringModelFrame(cl_ctx)
-    self.numpy_inputs = {
-      'calib': np.zeros((1, CALIB_LEN), dtype=np.float32),
-    }
+    self.output = np.zeros(OUTPUT_SIZE, dtype=np.float32)
+    self.inputs = {
+      'calib': np.zeros(CALIB_LEN, dtype=np.float32)}

-    if TICI:
-      self.tensor_inputs = {k: Tensor(v, device='NPY').realize() for k,v in self.numpy_inputs.items()}
-      with open(MODEL_PKL_PATH, "rb") as f:
-        self.model_run = pickle.load(f)
-    else:
-      self.onnx_cpu_runner = make_onnx_cpu_runner(MODEL_PATH)
+    self.model = ModelRunner(MODEL_PATHS, self.output, Runtime.GPU, False, cl_ctx)
+    self.model.addInput("input_img", None)
+    self.model.addInput("calib", self.inputs['calib'])

  def run(self, buf:VisionBuf, calib:np.ndarray, transform:np.ndarray) -> tuple[np.ndarray, float]:
-    self.numpy_inputs['calib'][0,:] = calib
+    self.inputs['calib'][:] = calib
+
+    self.model.setInputBuffer("input_img", self.frame.prepare(buf, transform.flatten(), None).view(np.float32))

    t1 = time.perf_counter()
-
-    input_img_cl = self.frame.prepare(buf, transform.flatten())
-    if TICI:
-      # The imgs tensors are backed by opencl memory, only need init once
-      if 'input_img' not in self.tensor_inputs:
-        self.tensor_inputs['input_img'] = qcom_tensor_from_opencl_address(input_img_cl.mem_address, (1, MODEL_WIDTH*MODEL_HEIGHT), dtype=dtypes.uint8)
-    else:
-      self.numpy_inputs['input_img'] = self.frame.buffer_from_cl(input_img_cl).reshape((1, MODEL_WIDTH*MODEL_HEIGHT))
-
-    if TICI:
-      output = self.model_run(**self.tensor_inputs).numpy().flatten()
-    else:
-      output = self.onnx_cpu_runner.run(None, self.numpy_inputs)[0].flatten()
-
+    self.model.execute()
    t2 = time.perf_counter()
-    return output, t2 - t1
+    return self.output, t2 - t1


 def fill_driver_state(msg, ds_result: DriverStateResult):
@@ -1,4 +1,10 @@
 #!/usr/bin/env bash

 DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null && pwd)"
+cd "$DIR/../../"
+
+if [ -f "$DIR/libthneed.so" ]; then
+  export LD_PRELOAD="$DIR/libthneed.so"
+fi
+
 exec "$DIR/modeld.py" "$@"
@@ -1,15 +1,5 @@
 #!/usr/bin/env python3
 import os
-from openpilot.system.hardware import TICI
-
-#
-if TICI:
-  from tinygrad.tensor import Tensor
-  from tinygrad.dtype import dtypes
-  from openpilot.selfdrive.modeld.runners.tinygrad_helpers import qcom_tensor_from_opencl_address
-  os.environ['QCOM'] = '1'
-else:
-  from openpilot.selfdrive.modeld.runners.ort_helpers import make_onnx_cpu_runner
 import time
 import pickle
 import numpy as np
@@ -28,19 +18,22 @@ from openpilot.common.transformations.camera import DEVICE_CAMERAS
 from openpilot.common.transformations.model import get_warp_matrix
 from openpilot.system import sentry
 from openpilot.selfdrive.controls.lib.desire_helper import DesireHelper
+from openpilot.selfdrive.modeld.runners import ModelRunner, Runtime
 from openpilot.selfdrive.modeld.parse_model_outputs import Parser
 from openpilot.selfdrive.modeld.fill_model_msg import fill_model_msg, fill_pose_msg, PublishState
 from openpilot.selfdrive.modeld.constants import ModelConstants
 from openpilot.selfdrive.modeld.models.commonmodel_pyx import DrivingModelFrame, CLContext

-
 PROCESS_NAME = "selfdrive.modeld.modeld"
 SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')

-MODEL_PATH = Path(__file__).parent / 'models/supercombo.onnx'
-MODEL_PKL_PATH = Path(__file__).parent / 'models/supercombo_tinygrad.pkl'
+MODEL_PATHS = {
+  ModelRunner.THNEED: Path(__file__).parent / 'models/supercombo.thneed',
+  ModelRunner.ONNX: Path(__file__).parent / 'models/supercombo.onnx'}
+
 METADATA_PATH = Path(__file__).parent / 'models/supercombo_metadata.pkl'

+
 class FrameMeta:
  frame_id: int = 0
  timestamp_sof: int = 0
@@ -51,39 +44,40 @@ class FrameMeta:
      self.frame_id, self.timestamp_sof, self.timestamp_eof = vipc.frame_id, vipc.timestamp_sof, vipc.timestamp_eof

 class ModelState:
-  frames: dict[str, DrivingModelFrame]
+  frame: DrivingModelFrame
+  wide_frame: DrivingModelFrame
  inputs: dict[str, np.ndarray]
  output: np.ndarray
  prev_desire: np.ndarray  # for tracking the rising edge of the pulse
+  model: ModelRunner

  def __init__(self, context: CLContext):
-    self.frames = {'input_imgs': DrivingModelFrame(context), 'big_input_imgs': DrivingModelFrame(context)}
+    self.frame = DrivingModelFrame(context)
+    self.wide_frame = DrivingModelFrame(context)
    self.prev_desire = np.zeros(ModelConstants.DESIRE_LEN, dtype=np.float32)
    self.full_features_20Hz = np.zeros((ModelConstants.FULL_HISTORY_BUFFER_LEN, ModelConstants.FEATURE_LEN), dtype=np.float32)
    self.desire_20Hz =  np.zeros((ModelConstants.FULL_HISTORY_BUFFER_LEN + 1, ModelConstants.DESIRE_LEN), dtype=np.float32)

    # img buffers are managed in openCL transform code
-    self.numpy_inputs = {
-      'desire': np.zeros((1, (ModelConstants.HISTORY_BUFFER_LEN+1), ModelConstants.DESIRE_LEN), dtype=np.float32),
-      'traffic_convention': np.zeros((1, ModelConstants.TRAFFIC_CONVENTION_LEN), dtype=np.float32),
-      'features_buffer': np.zeros((1, ModelConstants.HISTORY_BUFFER_LEN,  ModelConstants.FEATURE_LEN), dtype=np.float32),
+    self.inputs = {
+      'desire': np.zeros(ModelConstants.DESIRE_LEN * (ModelConstants.HISTORY_BUFFER_LEN+1), dtype=np.float32),
+      'traffic_convention': np.zeros(ModelConstants.TRAFFIC_CONVENTION_LEN, dtype=np.float32),
+      'features_buffer': np.zeros(ModelConstants.HISTORY_BUFFER_LEN * ModelConstants.FEATURE_LEN, dtype=np.float32),
    }

    with open(METADATA_PATH, 'rb') as f:
      model_metadata = pickle.load(f)
-    self.input_shapes =  model_metadata['input_shapes']

    self.output_slices = model_metadata['output_slices']
    net_output_size = model_metadata['output_shapes']['outputs'][1]
    self.output = np.zeros(net_output_size, dtype=np.float32)
    self.parser = Parser()

-    if TICI:
-      self.tensor_inputs = {k: Tensor(v, device='NPY').realize() for k,v in self.numpy_inputs.items()}
-      with open(MODEL_PKL_PATH, "rb") as f:
-        self.model_run = pickle.load(f)
-    else:
-      self.onnx_cpu_runner = make_onnx_cpu_runner(MODEL_PATH)
+    self.model = ModelRunner(MODEL_PATHS, self.output, Runtime.GPU, False, context)
+    self.model.addInput("input_imgs", None)
+    self.model.addInput("big_input_imgs", None)
+    for k,v in self.inputs.items():
+      self.model.addInput(k, v)

  def slice_outputs(self, model_outputs: np.ndarray) -> dict[str, np.ndarray]:
    parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in self.output_slices.items()}
@@ -100,36 +94,24 @@ class ModelState:

    self.desire_20Hz[:-1] = self.desire_20Hz[1:]
    self.desire_20Hz[-1] = new_desire
-    self.numpy_inputs['desire'][:] = self.desire_20Hz.reshape((1,25,4,-1)).max(axis=2)
+    self.inputs['desire'][:] = self.desire_20Hz.reshape((25,4,-1)).max(axis=1).flatten()

-    self.numpy_inputs['traffic_convention'][:] = inputs['traffic_convention']
-    imgs_cl = {'input_imgs': self.frames['input_imgs'].prepare(buf, transform.flatten()),
-               'big_input_imgs': self.frames['big_input_imgs'].prepare(wbuf, transform_wide.flatten())}
+    self.inputs['traffic_convention'][:] = inputs['traffic_convention']

-    if TICI:
-      # The imgs tensors are backed by opencl memory, only need init once
-      for key in imgs_cl:
-        if key not in self.tensor_inputs:
-          self.tensor_inputs[key] = qcom_tensor_from_opencl_address(imgs_cl[key].mem_address, self.input_shapes[key], dtype=dtypes.uint8)
-    else:
-      for key in imgs_cl:
-        self.numpy_inputs[key] = self.frames[key].buffer_from_cl(imgs_cl[key]).reshape(self.input_shapes[key])
+    self.model.setInputBuffer("input_imgs", self.frame.prepare(buf, transform.flatten(), self.model.getCLBuffer("input_imgs")))
+    self.model.setInputBuffer("big_input_imgs", self.wide_frame.prepare(wbuf, transform_wide.flatten(), self.model.getCLBuffer("big_input_imgs")))

    if prepare_only:
      return None

-    if TICI:
-      self.output = self.model_run(**self.tensor_inputs).numpy().flatten()
-    else:
-      self.output = self.onnx_cpu_runner.run(None, self.numpy_inputs)[0].flatten()
-
+    self.model.execute()
    outputs = self.parser.parse_outputs(self.slice_outputs(self.output))

    self.full_features_20Hz[:-1] = self.full_features_20Hz[1:]
    self.full_features_20Hz[-1] = outputs['hidden_state'][0, :]

    idxs = np.arange(-4,-100,-4)[::-1]
-    self.numpy_inputs['features_buffer'][:] = self.full_features_20Hz[idxs]
+    self.inputs['features_buffer'][:] = self.full_features_20Hz[idxs].flatten()
    return outputs


@@ -299,6 +281,7 @@ def main(demo=False):
      pm.send('modelV2', modelv2_send)
      pm.send('drivingModelData', drivingdata_send)
      pm.send('cameraOdometry', posenet_send)
+
    last_vipc_frame_id = meta_main.frame_id


@@ -7,7 +7,7 @@

 DrivingModelFrame::DrivingModelFrame(cl_device_id device_id, cl_context context) : ModelFrame(device_id, context) {
  input_frames = std::make_unique<uint8_t[]>(buf_size);
-  input_frames_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err));
+  //input_frames_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err));
  img_buffer_20hz_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, 5*frame_size_bytes, NULL, &err));
  region.origin = 4 * frame_size_bytes;
  region.size = frame_size_bytes;
@@ -17,7 +17,7 @@ DrivingModelFrame::DrivingModelFrame(cl_device_id device_id, cl_context context)
  init_transform(device_id, context, MODEL_WIDTH, MODEL_HEIGHT);
 }

-cl_mem* DrivingModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection) {
+uint8_t* DrivingModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection, cl_mem* output) {
  run_transform(yuv_cl, MODEL_WIDTH, MODEL_HEIGHT, frame_width, frame_height, frame_stride, frame_uv_offset, projection);

  for (int i = 0; i < 4; i++) {
@@ -25,12 +25,19 @@ cl_mem* DrivingModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_hei
  }
  loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, last_img_cl);

-  copy_queue(&loadyuv, q, img_buffer_20hz_cl, input_frames_cl, 0, 0, frame_size_bytes);
-  copy_queue(&loadyuv, q, last_img_cl, input_frames_cl, 0, frame_size_bytes, frame_size_bytes);
+  if (output == NULL) {
+    CL_CHECK(clEnqueueReadBuffer(q, img_buffer_20hz_cl, CL_TRUE, 0, frame_size_bytes, &input_frames[0], 0, nullptr, nullptr));
+    CL_CHECK(clEnqueueReadBuffer(q, last_img_cl, CL_TRUE, 0, frame_size_bytes, &input_frames[MODEL_FRAME_SIZE], 0, nullptr, nullptr));
+    clFinish(q);
+    return &input_frames[0];
+  } else {
+    copy_queue(&loadyuv, q, img_buffer_20hz_cl, *output, 0, 0, frame_size_bytes);
+    copy_queue(&loadyuv, q, last_img_cl, *output, 0, frame_size_bytes, frame_size_bytes);

-  // NOTE: Since thneed is using a different command queue, this clFinish is needed to ensure the image is ready.
-  clFinish(q);
-  return &input_frames_cl;
+    // NOTE: Since thneed is using a different command queue, this clFinish is needed to ensure the image is ready.
+    clFinish(q);
+    return NULL;
+  }
 }

 DrivingModelFrame::~DrivingModelFrame() {
@@ -44,15 +51,16 @@ DrivingModelFrame::~DrivingModelFrame() {

 MonitoringModelFrame::MonitoringModelFrame(cl_device_id device_id, cl_context context) : ModelFrame(device_id, context) {
  input_frames = std::make_unique<uint8_t[]>(buf_size);
-  input_frame_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err));
+  //input_frame_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err));

  init_transform(device_id, context, MODEL_WIDTH, MODEL_HEIGHT);
 }
-
-cl_mem* MonitoringModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection) {
+uint8_t* MonitoringModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection, cl_mem* output) {
  run_transform(yuv_cl, MODEL_WIDTH, MODEL_HEIGHT, frame_width, frame_height, frame_stride, frame_uv_offset, projection);
+  CL_CHECK(clEnqueueReadBuffer(q, y_cl, CL_TRUE, 0, MODEL_FRAME_SIZE * sizeof(uint8_t), input_frames.get(), 0, nullptr, nullptr));
  clFinish(q);
-  return &y_cl;
+  //return &y_cl;
+  return input_frames.get();
 }

 MonitoringModelFrame::~MonitoringModelFrame() {
@@ -23,12 +23,14 @@ public:
    q = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, 0, &err));
  }
  virtual ~ModelFrame() {}
-  virtual cl_mem* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection) { return NULL; }
+  virtual uint8_t* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection, cl_mem* output) { return NULL; }
+  /*
  uint8_t* buffer_from_cl(cl_mem *in_frames, int buffer_size) {
    CL_CHECK(clEnqueueReadBuffer(q, *in_frames, CL_TRUE, 0, buffer_size, input_frames.get(), 0, nullptr, nullptr));
    clFinish(q);
    return &input_frames[0];
  }
+  */

  int MODEL_WIDTH;
  int MODEL_HEIGHT;
@@ -66,7 +68,7 @@ class DrivingModelFrame : public ModelFrame {
 public:
  DrivingModelFrame(cl_device_id device_id, cl_context context);
  ~DrivingModelFrame();
-  cl_mem* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection);
+  uint8_t* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection, cl_mem* output);

  const int MODEL_WIDTH = 512;
  const int MODEL_HEIGHT = 256;
@@ -76,7 +78,7 @@ public:

 private:
  LoadYUVState loadyuv;
-  cl_mem img_buffer_20hz_cl, last_img_cl, input_frames_cl;
+  cl_mem img_buffer_20hz_cl, last_img_cl;//, input_frames_cl;
  cl_buffer_region region;
 };

@@ -84,7 +86,7 @@ class MonitoringModelFrame : public ModelFrame {
 public:
  MonitoringModelFrame(cl_device_id device_id, cl_context context);
  ~MonitoringModelFrame();
-  cl_mem* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection);
+  uint8_t* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection, cl_mem* output);

  const int MODEL_WIDTH = 1440;
  const int MODEL_HEIGHT = 960;
@@ -92,5 +94,5 @@ public:
  const int buf_size = MODEL_FRAME_SIZE;

 private:
-  cl_mem input_frame_cl;
+  // cl_mem input_frame_cl;
 };
@@ -14,8 +14,8 @@ cdef extern from "common/clutil.h":
 cdef extern from "selfdrive/modeld/models/commonmodel.h":
  cppclass ModelFrame:
    int buf_size
-    unsigned char * buffer_from_cl(cl_mem*, int);
-    cl_mem * prepare(cl_mem, int, int, int, int, mat3)
+    # unsigned char * buffer_from_cl(cl_mem*, int);
+    unsigned char * prepare(cl_mem, int, int, int, int, mat3, cl_mem*)

  cppclass DrivingModelFrame:
    int buf_size
@@ -39,17 +39,24 @@ cdef class ModelFrame:
  def __dealloc__(self):
    del self.frame

-  def prepare(self, VisionBuf buf, float[:] projection):
+  def prepare(self, VisionBuf buf, float[:] projection, CLMem output):
    cdef mat3 cprojection
    memcpy(cprojection.v, &projection[0], 9*sizeof(float))
-    cdef cl_mem * data
-    data = self.frame.prepare(buf.buf.buf_cl, buf.width, buf.height, buf.stride, buf.uv_offset, cprojection)
-    return CLMem.create(data)
+    cdef unsigned char * data
+    if output is None:
+      data = self.frame.prepare(buf.buf.buf_cl, buf.width, buf.height, buf.stride, buf.uv_offset, cprojection, NULL)
+    else:
+      data = self.frame.prepare(buf.buf.buf_cl, buf.width, buf.height, buf.stride, buf.uv_offset, cprojection, output.mem)
+    if not data:
+      return None

-  def buffer_from_cl(self, CLMem in_frames):
-    cdef unsigned char * data2
-    data2 = self.frame.buffer_from_cl(in_frames.mem, self.buf_size)
-    return np.asarray(<cnp.uint8_t[:self.buf_size]> data2)
+    return np.asarray(<cnp.uint8_t[:self.buf_size]> data)
+    # return CLMem.create(data)
+
+  # def buffer_from_cl(self, CLMem in_frames):
+  #   cdef unsigned char * data2
+  #   data2 = self.frame.buffer_from_cl(in_frames.mem, self.buf_size)
+  #   return np.asarray(<cnp.uint8_t[:self.buf_size]> data2)


 cdef class DrivingModelFrame(ModelFrame):
@@ -67,4 +74,3 @@ cdef class MonitoringModelFrame(ModelFrame):
    self._frame = new cppMonitoringModelFrame(context.device_id, context.context)
    self.frame = <cppModelFrame*>(self._frame)
    self.buf_size = self._frame.buf_size
-
@@ -0,0 +1,27 @@
+import os
+from openpilot.system.hardware import TICI
+from openpilot.selfdrive.modeld.runners.runmodel_pyx import RunModel, Runtime
+assert Runtime
+
+USE_THNEED = int(os.getenv('USE_THNEED', str(int(TICI))))
+USE_SNPE = int(os.getenv('USE_SNPE', str(int(TICI))))
+
+class ModelRunner(RunModel):
+  THNEED = 'THNEED'
+  SNPE = 'SNPE'
+  ONNX = 'ONNX'
+
+  def __new__(cls, paths, *args, **kwargs):
+    if ModelRunner.THNEED in paths and USE_THNEED:
+      from openpilot.selfdrive.modeld.runners.thneedmodel_pyx import ThneedModel as Runner
+      runner_type = ModelRunner.THNEED
+    elif ModelRunner.SNPE in paths and USE_SNPE:
+      from openpilot.selfdrive.modeld.runners.snpemodel_pyx import SNPEModel as Runner
+      runner_type = ModelRunner.SNPE
+    elif ModelRunner.ONNX in paths:
+      from openpilot.selfdrive.modeld.runners.onnxmodel import ONNXModel as Runner
+      runner_type = ModelRunner.ONNX
+    else:
+      raise Exception("Couldn't select a model runner, make sure to pass at least one valid model path")
+
+    return Runner(str(paths[runner_type]), *args, **kwargs)
@@ -0,0 +1,71 @@
+import os
+import onnx
+import sys
+import numpy as np
+from typing import Any
+
+from openpilot.selfdrive.modeld.runners.runmodel_pyx import RunModel
+from openpilot.selfdrive.modeld.runners.ort_helpers import convert_fp16_to_fp32, ORT_TYPES_TO_NP_TYPES
+
+
+def create_ort_session(path, fp16_to_fp32):
+  os.environ["OMP_NUM_THREADS"] = "4"
+  os.environ["OMP_WAIT_POLICY"] = "PASSIVE"
+
+  import onnxruntime as ort
+  print("Onnx available providers: ", ort.get_available_providers(), file=sys.stderr)
+  options = ort.SessionOptions()
+  options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL
+
+  provider: str | tuple[str, dict[Any, Any]]
+  if 'OpenVINOExecutionProvider' in ort.get_available_providers() and 'ONNXCPU' not in os.environ:
+    provider = 'OpenVINOExecutionProvider'
+  elif 'CUDAExecutionProvider' in ort.get_available_providers() and 'ONNXCPU' not in os.environ:
+    options.intra_op_num_threads = 2
+    provider = ('CUDAExecutionProvider', {'cudnn_conv_algo_search': 'EXHAUSTIVE'})
+  else:
+    options.intra_op_num_threads = 2
+    options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
+    options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+    provider = 'CPUExecutionProvider'
+
+  model_data = convert_fp16_to_fp32(onnx.load(path)) if fp16_to_fp32 else path
+  print("Onnx selected provider: ", [provider], file=sys.stderr)
+  ort_session = ort.InferenceSession(model_data, options, providers=[provider])
+  print("Onnx using ", ort_session.get_providers(), file=sys.stderr)
+  return ort_session
+
+
+class ONNXModel(RunModel):
+  def __init__(self, path, output, runtime, use_tf8, cl_context):
+    self.inputs = {}
+    self.output = output
+
+    self.session = create_ort_session(path, fp16_to_fp32=True)
+    self.input_names = [x.name for x in self.session.get_inputs()]
+    self.input_shapes = {x.name: [1, *x.shape[1:]] for x in self.session.get_inputs()}
+    self.input_dtypes = {x.name: ORT_TYPES_TO_NP_TYPES[x.type] for x in self.session.get_inputs()}
+
+    # run once to initialize CUDA provider
+    if "CUDAExecutionProvider" in self.session.get_providers():
+      self.session.run(None, {k: np.zeros(self.input_shapes[k], dtype=self.input_dtypes[k]) for k in self.input_names})
+    print("ready to run onnx model", self.input_shapes, file=sys.stderr)
+
+  def addInput(self, name, buffer):
+    assert name in self.input_names
+    self.inputs[name] = buffer
+
+  def setInputBuffer(self, name, buffer):
+    assert name in self.inputs
+    self.inputs[name] = buffer
+
+  def getCLBuffer(self, name):
+    return None
+
+  def execute(self):
+    inputs = {k: v.view(self.input_dtypes[k]) for k,v in self.inputs.items()}
+    inputs = {k: v.reshape(self.input_shapes[k]).astype(self.input_dtypes[k]) for k,v in inputs.items()}
+    outputs = self.session.run(None, inputs)
+    assert len(outputs) == 1, "Only single model outputs are supported"
+    self.output[:] = outputs[0]
+    return self.output
@@ -0,0 +1,4 @@
+#pragma once
+
+#include "selfdrive/modeld/runners/runmodel.h"
+#include "selfdrive/modeld/runners/snpemodel.h"
@@ -0,0 +1,49 @@
+#pragma once
+
+#include <string>
+#include <vector>
+#include <memory>
+#include <cassert>
+
+#include "common/clutil.h"
+#include "common/swaglog.h"
+
+#define USE_CPU_RUNTIME 0
+#define USE_GPU_RUNTIME 1
+#define USE_DSP_RUNTIME 2
+
+struct ModelInput {
+  const std::string name;
+  float *buffer;
+  int size;
+
+  ModelInput(const std::string _name, float *_buffer, int _size) : name(_name), buffer(_buffer), size(_size) {}
+  virtual void setBuffer(float *_buffer, int _size) {
+    assert(size == _size || size == 0);
+    buffer = _buffer;
+    size = _size;
+  }
+};
+
+class RunModel {
+public:
+  std::vector<std::unique_ptr<ModelInput>> inputs;
+
+  virtual ~RunModel() {}
+  virtual void execute() {}
+  virtual void* getCLBuffer(const std::string name) { return nullptr; }
+
+  virtual void addInput(const std::string name, float *buffer, int size) {
+    inputs.push_back(std::unique_ptr<ModelInput>(new ModelInput(name, buffer, size)));
+  }
+  virtual void setInputBuffer(const std::string name, float *buffer, int size) {
+    for (auto &input : inputs) {
+      if (name == input->name) {
+        input->setBuffer(buffer, size);
+        return;
+      }
+    }
+    LOGE("Tried to update input `%s` but no input with this name exists", name.c_str());
+    assert(false);
+  }
+};
@@ -0,0 +1,14 @@
+# distutils: language = c++
+
+from libcpp.string cimport string
+
+cdef extern from "selfdrive/modeld/runners/runmodel.h":
+  cdef int USE_CPU_RUNTIME
+  cdef int USE_GPU_RUNTIME
+  cdef int USE_DSP_RUNTIME
+
+  cdef cppclass RunModel:
+    void addInput(string, float*, int)
+    void setInputBuffer(string, float*, int)
+    void * getCLBuffer(string)
+    void execute()
@@ -0,0 +1,6 @@
+# distutils: language = c++
+
+from .runmodel cimport RunModel as cppRunModel
+
+cdef class RunModel:
+  cdef cppRunModel * model
@@ -0,0 +1,37 @@
+# distutils: language = c++
+# cython: c_string_encoding=ascii, language_level=3
+
+from libcpp.string cimport string
+
+from .runmodel cimport USE_CPU_RUNTIME, USE_GPU_RUNTIME, USE_DSP_RUNTIME
+from selfdrive.modeld.models.commonmodel_pyx cimport CLMem
+
+class Runtime:
+  CPU = USE_CPU_RUNTIME
+  GPU = USE_GPU_RUNTIME
+  DSP = USE_DSP_RUNTIME
+
+cdef class RunModel:
+  def __dealloc__(self):
+    del self.model
+
+  def addInput(self, string name, float[:] buffer):
+    if buffer is not None:
+      self.model.addInput(name, &buffer[0], len(buffer))
+    else:
+      self.model.addInput(name, NULL, 0)
+
+  def setInputBuffer(self, string name, float[:] buffer):
+    if buffer is not None:
+      self.model.setInputBuffer(name, &buffer[0], len(buffer))
+    else:
+      self.model.setInputBuffer(name, NULL, 0)
+
+  def getCLBuffer(self, string name):
+    cdef void * cl_buf = self.model.getCLBuffer(name)
+    if not cl_buf:
+      return None
+    return CLMem.create(cl_buf)
+
+  def execute(self):
+    self.model.execute()
@@ -0,0 +1,116 @@
+#pragma clang diagnostic ignored "-Wexceptions"
+
+#include "selfdrive/modeld/runners/snpemodel.h"
+
+#include <cstring>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "common/util.h"
+#include "common/timing.h"
+
+void PrintErrorStringAndExit() {
+  std::cerr << zdl::DlSystem::getLastErrorString() << std::endl;
+  std::exit(EXIT_FAILURE);
+}
+
+SNPEModel::SNPEModel(const std::string path, float *_output, size_t _output_size, int runtime, bool _use_tf8, cl_context context) {
+  output = _output;
+  output_size = _output_size;
+  use_tf8 = _use_tf8;
+
+#ifdef QCOM2
+  if (runtime == USE_GPU_RUNTIME) {
+    snpe_runtime = zdl::DlSystem::Runtime_t::GPU;
+  } else if (runtime == USE_DSP_RUNTIME) {
+    snpe_runtime = zdl::DlSystem::Runtime_t::DSP;
+  } else {
+    snpe_runtime = zdl::DlSystem::Runtime_t::CPU;
+  }
+  assert(zdl::SNPE::SNPEFactory::isRuntimeAvailable(snpe_runtime));
+#endif
+  model_data = util::read_file(path);
+  assert(model_data.size() > 0);
+
+  // load model
+  std::unique_ptr<zdl::DlContainer::IDlContainer> container = zdl::DlContainer::IDlContainer::open((uint8_t*)model_data.data(), model_data.size());
+  if (!container) { PrintErrorStringAndExit(); }
+  LOGW("loaded model with size: %lu", model_data.size());
+
+  // create model runner
+  zdl::SNPE::SNPEBuilder snpe_builder(container.get());
+  while (!snpe) {
+#ifdef QCOM2
+    snpe = snpe_builder.setOutputLayers({})
+                       .setRuntimeProcessor(snpe_runtime)
+                       .setUseUserSuppliedBuffers(true)
+                       .setPerformanceProfile(zdl::DlSystem::PerformanceProfile_t::HIGH_PERFORMANCE)
+                       .build();
+#else
+    snpe = snpe_builder.setOutputLayers({})
+                       .setUseUserSuppliedBuffers(true)
+                       .setPerformanceProfile(zdl::DlSystem::PerformanceProfile_t::HIGH_PERFORMANCE)
+                       .build();
+#endif
+    if (!snpe) std::cerr << zdl::DlSystem::getLastErrorString() << std::endl;
+  }
+
+  // create output buffer
+  zdl::DlSystem::UserBufferEncodingFloat ub_encoding_float;
+  zdl::DlSystem::IUserBufferFactory &ub_factory = zdl::SNPE::SNPEFactory::getUserBufferFactory();
+
+  const auto &output_tensor_names_opt = snpe->getOutputTensorNames();
+  if (!output_tensor_names_opt) throw std::runtime_error("Error obtaining output tensor names");
+  const auto &output_tensor_names = *output_tensor_names_opt;
+  assert(output_tensor_names.size() == 1);
+  const char *output_tensor_name = output_tensor_names.at(0);
+  const zdl::DlSystem::TensorShape &buffer_shape = snpe->getInputOutputBufferAttributes(output_tensor_name)->getDims();
+  if (output_size != 0) {
+    assert(output_size == buffer_shape[1]);
+  } else {
+    output_size = buffer_shape[1];
+  }
+  std::vector<size_t> output_strides = {output_size * sizeof(float), sizeof(float)};
+  output_buffer = ub_factory.createUserBuffer(output, output_size * sizeof(float), output_strides, &ub_encoding_float);
+  output_map.add(output_tensor_name, output_buffer.get());
+}
+
+void SNPEModel::addInput(const std::string name, float *buffer, int size) {
+  const int idx = inputs.size();
+  const auto &input_tensor_names_opt = snpe->getInputTensorNames();
+  if (!input_tensor_names_opt) throw std::runtime_error("Error obtaining input tensor names");
+  const auto &input_tensor_names = *input_tensor_names_opt;
+  const char *input_tensor_name = input_tensor_names.at(idx);
+  const bool input_tf8 = use_tf8 && strcmp(input_tensor_name, "input_img") == 0;  // TODO: This is a terrible hack, get rid of this name check both here and in onnx_runner.py
+  LOGW("adding index %d: %s", idx, input_tensor_name);
+
+  zdl::DlSystem::UserBufferEncodingFloat ub_encoding_float;
+  zdl::DlSystem::UserBufferEncodingTf8 ub_encoding_tf8(0, 1./255); // network takes 0-1
+  zdl::DlSystem::IUserBufferFactory &ub_factory = zdl::SNPE::SNPEFactory::getUserBufferFactory();
+  zdl::DlSystem::UserBufferEncoding *input_encoding = input_tf8 ? (zdl::DlSystem::UserBufferEncoding*)&ub_encoding_tf8 : (zdl::DlSystem::UserBufferEncoding*)&ub_encoding_float;
+
+  const auto &buffer_shape_opt = snpe->getInputDimensions(input_tensor_name);
+  const zdl::DlSystem::TensorShape &buffer_shape = *buffer_shape_opt;
+  size_t size_of_input = input_tf8 ? sizeof(uint8_t) : sizeof(float);
+  std::vector<size_t> strides(buffer_shape.rank());
+  strides[strides.size() - 1] = size_of_input;
+  size_t product = 1;
+  for (size_t i = 0; i < buffer_shape.rank(); i++) product *= buffer_shape[i];
+  size_t stride = strides[strides.size() - 1];
+  for (size_t i = buffer_shape.rank() - 1; i > 0; i--) {
+    stride *= buffer_shape[i];
+    strides[i-1] = stride;
+  }
+
+  auto input_buffer = ub_factory.createUserBuffer(buffer, product*size_of_input, strides, input_encoding);
+  input_map.add(input_tensor_name, input_buffer.get());
+  inputs.push_back(std::unique_ptr<SNPEModelInput>(new SNPEModelInput(name, buffer, size, std::move(input_buffer))));
+}
+
+void SNPEModel::execute() {
+  if (!snpe->execute(input_map, output_map)) {
+    PrintErrorStringAndExit();
+  }
+}
@@ -0,0 +1,52 @@
+#pragma once
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
+
+#include <memory>
+#include <string>
+#include <utility>
+
+#include <DlContainer/IDlContainer.hpp>
+#include <DlSystem/DlError.hpp>
+#include <DlSystem/ITensor.hpp>
+#include <DlSystem/ITensorFactory.hpp>
+#include <DlSystem/IUserBuffer.hpp>
+#include <DlSystem/IUserBufferFactory.hpp>
+#include <SNPE/SNPE.hpp>
+#include <SNPE/SNPEBuilder.hpp>
+#include <SNPE/SNPEFactory.hpp>
+
+#include "selfdrive/modeld/runners/runmodel.h"
+
+struct SNPEModelInput : public ModelInput {
+  std::unique_ptr<zdl::DlSystem::IUserBuffer> snpe_buffer;
+
+  SNPEModelInput(const std::string _name, float *_buffer, int _size, std::unique_ptr<zdl::DlSystem::IUserBuffer> _snpe_buffer) : ModelInput(_name, _buffer, _size), snpe_buffer(std::move(_snpe_buffer)) {}
+  void setBuffer(float *_buffer, int _size) {
+    ModelInput::setBuffer(_buffer, _size);
+    assert(snpe_buffer->setBufferAddress(_buffer) == true);
+  }
+};
+
+class SNPEModel : public RunModel {
+public:
+  SNPEModel(const std::string path, float *_output, size_t _output_size, int runtime, bool use_tf8 = false, cl_context context = NULL);
+  void addInput(const std::string name, float *buffer, int size);
+  void execute();
+
+private:
+  std::string model_data;
+
+#ifdef QCOM2
+  zdl::DlSystem::Runtime_t snpe_runtime;
+#endif
+
+  // snpe model stuff
+  std::unique_ptr<zdl::SNPE::SNPE> snpe;
+  zdl::DlSystem::UserBufferMap input_map;
+  zdl::DlSystem::UserBufferMap output_map;
+  std::unique_ptr<zdl::DlSystem::IUserBuffer> output_buffer;
+
+  bool use_tf8;
+  float *output;
+  size_t output_size;
+};
@@ -0,0 +1,9 @@
+# distutils: language = c++
+
+from libcpp.string cimport string
+
+from msgq.visionipc.visionipc cimport cl_context
+
+cdef extern from "selfdrive/modeld/runners/snpemodel.h":
+  cdef cppclass SNPEModel:
+    SNPEModel(string, float*, size_t, int, bool, cl_context)
@@ -0,0 +1,17 @@
+# distutils: language = c++
+# cython: c_string_encoding=ascii, language_level=3
+
+import os
+from libcpp cimport bool
+from libcpp.string cimport string
+
+from .snpemodel cimport SNPEModel as cppSNPEModel
+from selfdrive.modeld.models.commonmodel_pyx cimport CLContext
+from selfdrive.modeld.runners.runmodel_pyx cimport RunModel
+from selfdrive.modeld.runners.runmodel cimport RunModel as cppRunModel
+
+os.environ['ADSP_LIBRARY_PATH'] = "/data/pythonpath/third_party/snpe/dsp/"
+
+cdef class SNPEModel(RunModel):
+  def __cinit__(self, string path, float[:] output, int runtime, bool use_tf8, CLContext context):
+    self.model = <cppRunModel *> new cppSNPEModel(path, &output[0], len(output), runtime, use_tf8, context.context)
@@ -0,0 +1,58 @@
+#include "selfdrive/modeld/runners/thneedmodel.h"
+
+#include <string>
+
+#include "common/swaglog.h"
+
+ThneedModel::ThneedModel(const std::string path, float *_output, size_t _output_size, int runtime, bool luse_tf8, cl_context context) {
+  thneed = new Thneed(true, context);
+  thneed->load(path.c_str());
+  thneed->clexec();
+
+  recorded = false;
+  output = _output;
+}
+
+void* ThneedModel::getCLBuffer(const std::string name) {
+  int index = -1;
+  for (int i = 0; i < inputs.size(); i++) {
+    if (name == inputs[i]->name) {
+      index = i;
+      break;
+    }
+  }
+
+  if (index == -1) {
+    LOGE("Tried to get CL buffer for input `%s` but no input with this name exists", name.c_str());
+    assert(false);
+  }
+
+  if (thneed->input_clmem.size() >= inputs.size()) {
+    return &thneed->input_clmem[inputs.size() - index - 1];
+  } else {
+    return nullptr;
+  }
+}
+
+void ThneedModel::execute() {
+  if (!recorded) {
+    thneed->record = true;
+    float *input_buffers[inputs.size()];
+    for (int i = 0; i < inputs.size(); i++) {
+      input_buffers[inputs.size() - i - 1] = inputs[i]->buffer;
+    }
+
+    thneed->copy_inputs(input_buffers);
+    thneed->clexec();
+    thneed->copy_output(output);
+    thneed->stop();
+
+    recorded = true;
+  } else {
+    float *input_buffers[inputs.size()];
+    for (int i = 0; i < inputs.size(); i++) {
+      input_buffers[inputs.size() - i - 1] = inputs[i]->buffer;
+    }
+    thneed->execute(input_buffers, output);
+  }
+}
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <string>
+
+#include "selfdrive/modeld/runners/runmodel.h"
+#include "selfdrive/modeld/thneed/thneed.h"
+
+class ThneedModel : public RunModel {
+public:
+  ThneedModel(const std::string path, float *_output, size_t _output_size, int runtime, bool use_tf8 = false, cl_context context = NULL);
+  void *getCLBuffer(const std::string name);
+  void execute();
+private:
+  Thneed *thneed = NULL;
+  bool recorded;
+  float *output;
+};
@@ -0,0 +1,9 @@
+# distutils: language = c++
+
+from libcpp.string cimport string
+
+from msgq.visionipc.visionipc cimport cl_context
+
+cdef extern from "selfdrive/modeld/runners/thneedmodel.h":
+  cdef cppclass ThneedModel:
+    ThneedModel(string, float*, size_t, int, bool, cl_context)
@@ -0,0 +1,14 @@
+# distutils: language = c++
+# cython: c_string_encoding=ascii, language_level=3
+
+from libcpp cimport bool
+from libcpp.string cimport string
+
+from .thneedmodel cimport ThneedModel as cppThneedModel
+from selfdrive.modeld.models.commonmodel_pyx cimport CLContext
+from selfdrive.modeld.runners.runmodel_pyx cimport RunModel
+from selfdrive.modeld.runners.runmodel cimport RunModel as cppRunModel
+
+cdef class ThneedModel(RunModel):
+  def __cinit__(self, string path, float[:] output, int runtime, bool use_tf8, CLContext context):
+    self.model = <cppRunModel *> new cppThneedModel(path, &output[0], len(output), runtime, use_tf8, context.context)
@@ -1,8 +0,0 @@
-
-from tinygrad.tensor import Tensor
-from tinygrad.helpers import to_mv
-
-def qcom_tensor_from_opencl_address(opencl_address, shape, dtype):
-  cl_buf_desc_ptr = to_mv(opencl_address, 8).cast('Q')[0]
-  rawbuf_ptr = to_mv(cl_buf_desc_ptr, 0x100).cast('Q')[20] # offset 0xA0 is a raw gpu pointer.
-  return Tensor.from_blob(rawbuf_ptr, shape, dtype=dtype, device='QCOM')
@@ -0,0 +1,8 @@
+thneed is an SNPE accelerator. I know SNPE is already an accelerator, but sometimes things need to go even faster..
+
+It runs on the local device, and caches a single model run. Then it replays it, but fast.
+
+thneed slices through abstraction layers like a fish.
+
+You need a thneed.
+
@@ -0,0 +1,154 @@
+#include <cassert>
+#include <set>
+
+#include "third_party/json11/json11.hpp"
+#include "common/util.h"
+#include "common/clutil.h"
+#include "common/swaglog.h"
+#include "selfdrive/modeld/thneed/thneed.h"
+using namespace json11;
+
+extern map<cl_program, string> g_program_source;
+
+void Thneed::load(const char *filename) {
+  LOGD("Thneed::load: loading from %s\n", filename);
+
+  string buf = util::read_file(filename);
+  int jsz = *(int *)buf.data();
+  string jsonerr;
+  string jj(buf.data() + sizeof(int), jsz);
+  Json jdat = Json::parse(jj, jsonerr);
+
+  map<cl_mem, cl_mem> real_mem;
+  real_mem[NULL] = NULL;
+
+  int ptr = sizeof(int)+jsz;
+  for (auto &obj : jdat["objects"].array_items()) {
+    auto mobj = obj.object_items();
+    int sz = mobj["size"].int_value();
+    cl_mem clbuf = NULL;
+
+    if (mobj["buffer_id"].string_value().size() > 0) {
+      // image buffer must already be allocated
+      clbuf = real_mem[*(cl_mem*)(mobj["buffer_id"].string_value().data())];
+      assert(mobj["needs_load"].bool_value() == false);
+    } else {
+      if (mobj["needs_load"].bool_value()) {
+        clbuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sz, &buf[ptr], NULL);
+        if (debug >= 1) printf("loading %p %d @ 0x%X\n", clbuf, sz, ptr);
+        ptr += sz;
+      } else {
+        // TODO: is there a faster way to init zeroed out buffers?
+        void *host_zeros = calloc(sz, 1);
+        clbuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sz, host_zeros, NULL);
+        free(host_zeros);
+      }
+    }
+    assert(clbuf != NULL);
+
+    if (mobj["arg_type"] == "image2d_t" || mobj["arg_type"] == "image1d_t") {
+      cl_image_desc desc = {0};
+      desc.image_type = (mobj["arg_type"] == "image2d_t") ? CL_MEM_OBJECT_IMAGE2D : CL_MEM_OBJECT_IMAGE1D_BUFFER;
+      desc.image_width = mobj["width"].int_value();
+      desc.image_height = mobj["height"].int_value();
+      desc.image_row_pitch = mobj["row_pitch"].int_value();
+      assert(sz == desc.image_height*desc.image_row_pitch);
+#ifdef QCOM2
+      desc.buffer = clbuf;
+#else
+      // TODO: we are creating unused buffers on PC
+      clReleaseMemObject(clbuf);
+#endif
+      cl_image_format format = {0};
+      format.image_channel_order = CL_RGBA;
+      format.image_channel_data_type = mobj["float32"].bool_value() ? CL_FLOAT : CL_HALF_FLOAT;
+
+      cl_int errcode;
+
+#ifndef QCOM2
+      if (mobj["needs_load"].bool_value()) {
+        clbuf = clCreateImage(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, &format, &desc, &buf[ptr-sz], &errcode);
+      } else {
+        clbuf = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &errcode);
+      }
+#else
+      clbuf = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &errcode);
+#endif
+      if (clbuf == NULL) {
+        LOGE("clError: %s create image %zux%zu rp %zu with buffer %p\n", cl_get_error_string(errcode),
+             desc.image_width, desc.image_height, desc.image_row_pitch, desc.buffer);
+      }
+      assert(clbuf != NULL);
+    }
+
+    real_mem[*(cl_mem*)(mobj["id"].string_value().data())] = clbuf;
+  }
+
+  map<string, cl_program> g_programs;
+  for (const auto &[name, source] : jdat["programs"].object_items()) {
+    if (debug >= 1) printf("building %s with size %zu\n", name.c_str(), source.string_value().size());
+    g_programs[name] = cl_program_from_source(context, device_id, source.string_value());
+  }
+
+  for (auto &obj : jdat["inputs"].array_items()) {
+    auto mobj = obj.object_items();
+    int sz = mobj["size"].int_value();
+    cl_mem aa = real_mem[*(cl_mem*)(mobj["buffer_id"].string_value().data())];
+    input_clmem.push_back(aa);
+    input_sizes.push_back(sz);
+    LOGD("Thneed::load: adding input %s with size %d\n", mobj["name"].string_value().data(), sz);
+
+    cl_int cl_err;
+    void *ret = clEnqueueMapBuffer(command_queue, aa, CL_TRUE, CL_MAP_WRITE, 0, sz, 0, NULL, NULL, &cl_err);
+    if (cl_err != CL_SUCCESS) LOGE("clError: %s map %p %d\n", cl_get_error_string(cl_err), aa, sz);
+    assert(cl_err == CL_SUCCESS);
+    inputs.push_back(ret);
+  }
+
+  for (auto &obj : jdat["outputs"].array_items()) {
+    auto mobj = obj.object_items();
+    int sz = mobj["size"].int_value();
+    LOGD("Thneed::save: adding output with size %d\n", sz);
+    // TODO: support multiple outputs
+    output = real_mem[*(cl_mem*)(mobj["buffer_id"].string_value().data())];
+    assert(output != NULL);
+  }
+
+  for (auto &obj : jdat["binaries"].array_items()) {
+    string name = obj["name"].string_value();
+    size_t length = obj["length"].int_value();
+    if (debug >= 1) printf("binary %s with size %zu\n", name.c_str(), length);
+    g_programs[name] = cl_program_from_binary(context, device_id, (const uint8_t*)&buf[ptr], length);
+    ptr += length;
+  }
+
+  for (auto &obj : jdat["kernels"].array_items()) {
+    auto gws = obj["global_work_size"];
+    auto lws = obj["local_work_size"];
+    auto kk = shared_ptr<CLQueuedKernel>(new CLQueuedKernel(this));
+
+    kk->name = obj["name"].string_value();
+    kk->program = g_programs[kk->name];
+    kk->work_dim = obj["work_dim"].int_value();
+    for (int i = 0; i < kk->work_dim; i++) {
+      kk->global_work_size[i] = gws[i].int_value();
+      kk->local_work_size[i] = lws[i].int_value();
+    }
+    kk->num_args = obj["num_args"].int_value();
+    for (int i = 0; i < kk->num_args; i++) {
+      string arg = obj["args"].array_items()[i].string_value();
+      int arg_size = obj["args_size"].array_items()[i].int_value();
+      kk->args_size.push_back(arg_size);
+      if (arg_size == 8) {
+        cl_mem val = *(cl_mem*)(arg.data());
+        val = real_mem[val];
+        kk->args.push_back(string((char*)&val, sizeof(val)));
+      } else {
+        kk->args.push_back(arg);
+      }
+    }
+    kq.push_back(kk);
+  }
+
+  clFinish(command_queue);
+}
@@ -0,0 +1,133 @@
+#pragma once
+
+#ifndef __user
+#define __user __attribute__(())
+#endif
+
+#include <cstdint>
+#include <cstdlib>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <CL/cl.h>
+
+#include "third_party/linux/include/msm_kgsl.h"
+
+using namespace std;
+
+cl_int thneed_clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value);
+
+namespace json11 {
+  class Json;
+}
+class Thneed;
+
+class GPUMalloc {
+  public:
+    GPUMalloc(int size, int fd);
+    ~GPUMalloc();
+    void *alloc(int size);
+  private:
+    uint64_t base;
+    int remaining;
+};
+
+class CLQueuedKernel {
+  public:
+    CLQueuedKernel(Thneed *lthneed) { thneed = lthneed; }
+    CLQueuedKernel(Thneed *lthneed,
+                   cl_kernel _kernel,
+                   cl_uint _work_dim,
+                   const size_t *_global_work_size,
+                   const size_t *_local_work_size);
+    cl_int exec();
+    void debug_print(bool verbose);
+    int get_arg_num(const char *search_arg_name);
+    cl_program program;
+    string name;
+    cl_uint num_args;
+    vector<string> arg_names;
+    vector<string> arg_types;
+    vector<string> args;
+    vector<int> args_size;
+    cl_kernel kernel = NULL;
+    json11::Json to_json() const;
+
+    cl_uint work_dim;
+    size_t global_work_size[3] = {0};
+    size_t local_work_size[3] = {0};
+  private:
+    Thneed *thneed;
+};
+
+class CachedIoctl {
+  public:
+    virtual void exec() {}
+};
+
+class CachedSync: public CachedIoctl {
+  public:
+    CachedSync(Thneed *lthneed, string ldata) { thneed = lthneed; data = ldata; }
+    void exec();
+  private:
+    Thneed *thneed;
+    string data;
+};
+
+class CachedCommand: public CachedIoctl {
+  public:
+    CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd);
+    void exec();
+  private:
+    void disassemble(int cmd_index);
+    struct kgsl_gpu_command cache;
+    unique_ptr<kgsl_command_object[]> cmds;
+    unique_ptr<kgsl_command_object[]> objs;
+    Thneed *thneed;
+    vector<shared_ptr<CLQueuedKernel> > kq;
+};
+
+class Thneed {
+  public:
+    Thneed(bool do_clinit=false, cl_context _context = NULL);
+    void stop();
+    void execute(float **finputs, float *foutput, bool slow=false);
+    void wait();
+
+    vector<cl_mem> input_clmem;
+    vector<void *> inputs;
+    vector<size_t> input_sizes;
+    cl_mem output = NULL;
+
+    cl_context context = NULL;
+    cl_command_queue command_queue;
+    cl_device_id device_id;
+    int context_id;
+
+    // protected?
+    bool record = false;
+    int debug;
+    int timestamp;
+
+#ifdef QCOM2
+    unique_ptr<GPUMalloc> ram;
+    vector<unique_ptr<CachedIoctl> > cmds;
+    int fd;
+#endif
+
+    // all CL kernels
+    void copy_inputs(float **finputs, bool internal=false);
+    void copy_output(float *foutput);
+    cl_int clexec();
+    vector<shared_ptr<CLQueuedKernel> > kq;
+
+    // pending CL kernels
+    vector<shared_ptr<CLQueuedKernel> > ckq;
+
+    // loading
+    void load(const char *filename);
+  private:
+    void clinit();
+};
+
@@ -0,0 +1,216 @@
+#include "selfdrive/modeld/thneed/thneed.h"
+
+#include <cassert>
+#include <cstring>
+#include <map>
+
+#include "common/clutil.h"
+#include "common/timing.h"
+
+map<pair<cl_kernel, int>, string> g_args;
+map<pair<cl_kernel, int>, int> g_args_size;
+map<cl_program, string> g_program_source;
+
+void Thneed::stop() {
+  //printf("Thneed::stop: recorded %lu commands\n", cmds.size());
+  record = false;
+}
+
+void Thneed::clinit() {
+  device_id = cl_get_device_id(CL_DEVICE_TYPE_DEFAULT);
+  if (context == NULL) context = CL_CHECK_ERR(clCreateContext(NULL, 1, &device_id, NULL, NULL, &err));
+  //cl_command_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
+  cl_command_queue_properties props[3] = {CL_QUEUE_PROPERTIES, 0, 0};
+  command_queue = CL_CHECK_ERR(clCreateCommandQueueWithProperties(context, device_id, props, &err));
+  printf("Thneed::clinit done\n");
+}
+
+cl_int Thneed::clexec() {
+  if (debug >= 1) printf("Thneed::clexec: running %lu queued kernels\n", kq.size());
+  for (auto &k : kq) {
+    if (record) ckq.push_back(k);
+    cl_int ret = k->exec();
+    assert(ret == CL_SUCCESS);
+  }
+  return clFinish(command_queue);
+}
+
+void Thneed::copy_inputs(float **finputs, bool internal) {
+  for (int idx = 0; idx < inputs.size(); ++idx) {
+    if (debug >= 1) printf("copying %lu -- %p -> %p (cl %p)\n", input_sizes[idx], finputs[idx], inputs[idx], input_clmem[idx]);
+
+    if (internal) {
+      // if it's internal, using memcpy is fine since the buffer sync is cached in the ioctl layer
+      if (finputs[idx] != NULL) memcpy(inputs[idx], finputs[idx], input_sizes[idx]);
+    } else {
+      if (finputs[idx] != NULL) CL_CHECK(clEnqueueWriteBuffer(command_queue, input_clmem[idx], CL_TRUE, 0, input_sizes[idx], finputs[idx], 0, NULL, NULL));
+    }
+  }
+}
+
+void Thneed::copy_output(float *foutput) {
+  if (output != NULL) {
+    size_t sz;
+    clGetMemObjectInfo(output, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
+    if (debug >= 1) printf("copying %lu for output %p -> %p\n", sz, output, foutput);
+    CL_CHECK(clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sz, foutput, 0, NULL, NULL));
+  } else {
+    printf("CAUTION: model output is NULL, does it have no outputs?\n");
+  }
+}
+
+// *********** CLQueuedKernel ***********
+
+CLQueuedKernel::CLQueuedKernel(Thneed *lthneed,
+                               cl_kernel _kernel,
+                               cl_uint _work_dim,
+                               const size_t *_global_work_size,
+                               const size_t *_local_work_size) {
+  thneed = lthneed;
+  kernel = _kernel;
+  work_dim = _work_dim;
+  assert(work_dim <= 3);
+  for (int i = 0; i < work_dim; i++) {
+    global_work_size[i] = _global_work_size[i];
+    local_work_size[i] = _local_work_size[i];
+  }
+
+  char _name[0x100];
+  clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, sizeof(_name), _name, NULL);
+  name = string(_name);
+  clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS, sizeof(num_args), &num_args, NULL);
+
+  // get args
+  for (int i = 0; i < num_args; i++) {
+    char arg_name[0x100] = {0};
+    clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL);
+    arg_names.push_back(string(arg_name));
+    clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_name), arg_name, NULL);
+    arg_types.push_back(string(arg_name));
+
+    args.push_back(g_args[make_pair(kernel, i)]);
+    args_size.push_back(g_args_size[make_pair(kernel, i)]);
+  }
+
+  // get program
+  clGetKernelInfo(kernel, CL_KERNEL_PROGRAM, sizeof(program), &program, NULL);
+}
+
+int CLQueuedKernel::get_arg_num(const char *search_arg_name) {
+  for (int i = 0; i < num_args; i++) {
+    if (arg_names[i] == search_arg_name) return i;
+  }
+  printf("failed to find %s in %s\n", search_arg_name, name.c_str());
+  assert(false);
+}
+
+cl_int CLQueuedKernel::exec() {
+  if (kernel == NULL) {
+    kernel = clCreateKernel(program, name.c_str(), NULL);
+    arg_names.clear();
+    arg_types.clear();
+
+    for (int j = 0; j < num_args; j++) {
+      char arg_name[0x100] = {0};
+      clGetKernelArgInfo(kernel, j, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL);
+      arg_names.push_back(string(arg_name));
+      clGetKernelArgInfo(kernel, j, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_name), arg_name, NULL);
+      arg_types.push_back(string(arg_name));
+
+      cl_int ret;
+      if (args[j].size() != 0) {
+        assert(args[j].size() == args_size[j]);
+        ret = thneed_clSetKernelArg(kernel, j, args[j].size(), args[j].data());
+      } else {
+        ret = thneed_clSetKernelArg(kernel, j, args_size[j], NULL);
+      }
+      assert(ret == CL_SUCCESS);
+    }
+  }
+
+  if (thneed->debug >= 1) {
+    debug_print(thneed->debug >= 2);
+  }
+
+  return clEnqueueNDRangeKernel(thneed->command_queue,
+    kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL);
+}
+
+void CLQueuedKernel::debug_print(bool verbose) {
+  printf("%p %56s -- ", kernel, name.c_str());
+  for (int i = 0; i < work_dim; i++) {
+    printf("%4zu ", global_work_size[i]);
+  }
+  printf(" -- ");
+  for (int i = 0; i < work_dim; i++) {
+    printf("%4zu ", local_work_size[i]);
+  }
+  printf("\n");
+
+  if (verbose) {
+    for (int i = 0; i < num_args; i++) {
+      string arg = args[i];
+      printf("  %s %s", arg_types[i].c_str(), arg_names[i].c_str());
+      void *arg_value = (void*)arg.data();
+      int arg_size = arg.size();
+      if (arg_size == 0) {
+        printf(" (size) %d", args_size[i]);
+      } else if (arg_size == 1) {
+        printf(" = %d", *((char*)arg_value));
+      } else if (arg_size == 2) {
+        printf(" = %d", *((short*)arg_value));
+      } else if (arg_size == 4) {
+        if (arg_types[i] == "float") {
+          printf(" = %f", *((float*)arg_value));
+        } else {
+          printf(" = %d", *((int*)arg_value));
+        }
+      } else if (arg_size == 8) {
+        cl_mem val = (cl_mem)(*((uintptr_t*)arg_value));
+        printf(" = %p", val);
+        if (val != NULL) {
+          cl_mem_object_type obj_type;
+          clGetMemObjectInfo(val, CL_MEM_TYPE, sizeof(obj_type), &obj_type, NULL);
+          if (arg_types[i] == "image2d_t" || arg_types[i] == "image1d_t" || obj_type == CL_MEM_OBJECT_IMAGE2D) {
+            cl_image_format format;
+            size_t width, height, depth, array_size, row_pitch, slice_pitch;
+            cl_mem buf;
+            clGetImageInfo(val, CL_IMAGE_FORMAT, sizeof(format), &format, NULL);
+            assert(format.image_channel_order == CL_RGBA);
+            assert(format.image_channel_data_type == CL_HALF_FLOAT || format.image_channel_data_type == CL_FLOAT);
+            clGetImageInfo(val, CL_IMAGE_WIDTH, sizeof(width), &width, NULL);
+            clGetImageInfo(val, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
+            clGetImageInfo(val, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL);
+            clGetImageInfo(val, CL_IMAGE_DEPTH, sizeof(depth), &depth, NULL);
+            clGetImageInfo(val, CL_IMAGE_ARRAY_SIZE, sizeof(array_size), &array_size, NULL);
+            clGetImageInfo(val, CL_IMAGE_SLICE_PITCH, sizeof(slice_pitch), &slice_pitch, NULL);
+            assert(depth == 0);
+            assert(array_size == 0);
+            assert(slice_pitch == 0);
+
+            clGetImageInfo(val, CL_IMAGE_BUFFER, sizeof(buf), &buf, NULL);
+            size_t sz = 0;
+            if (buf != NULL) clGetMemObjectInfo(buf, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
+            printf(" image %zu x %zu rp %zu @ %p buffer %zu", width, height, row_pitch, buf, sz);
+          } else {
+            size_t sz;
+            clGetMemObjectInfo(val, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
+            printf(" buffer %zu", sz);
+          }
+        }
+      }
+      printf("\n");
+    }
+  }
+}
+
+cl_int thneed_clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) {
+  g_args_size[make_pair(kernel, arg_index)] = arg_size;
+  if (arg_value != NULL) {
+    g_args[make_pair(kernel, arg_index)] = string((char*)arg_value, arg_size);
+  } else {
+    g_args[make_pair(kernel, arg_index)] = string("");
+  }
+  cl_int ret = clSetKernelArg(kernel, arg_index, arg_size, arg_value);
+  return ret;
+}
@@ -0,0 +1,32 @@
+#include "selfdrive/modeld/thneed/thneed.h"
+
+#include <cassert>
+
+#include "common/clutil.h"
+#include "common/timing.h"
+
+Thneed::Thneed(bool do_clinit, cl_context _context) {
+  context = _context;
+  if (do_clinit) clinit();
+  char *thneed_debug_env = getenv("THNEED_DEBUG");
+  debug = (thneed_debug_env != NULL) ? atoi(thneed_debug_env) : 0;
+}
+
+void Thneed::execute(float **finputs, float *foutput, bool slow) {
+  uint64_t tb, te;
+  if (debug >= 1) tb = nanos_since_boot();
+
+  // ****** copy inputs
+  copy_inputs(finputs);
+
+  // ****** run commands
+  clexec();
+
+  // ****** copy outputs
+  copy_output(foutput);
+
+  if (debug >= 1) {
+    te = nanos_since_boot();
+    printf("model exec in %lu us\n", (te-tb)/1000);
+  }
+}
@@ -0,0 +1,258 @@
+#include "selfdrive/modeld/thneed/thneed.h"
+
+#include <dlfcn.h>
+#include <sys/mman.h>
+
+#include <cassert>
+#include <cerrno>
+#include <cstring>
+#include <map>
+#include <string>
+
+#include "common/clutil.h"
+#include "common/timing.h"
+
+Thneed *g_thneed = NULL;
+int g_fd = -1;
+
+void hexdump(uint8_t *d, int len) {
+  assert((len%4) == 0);
+  printf("  dumping %p len 0x%x\n", d, len);
+  for (int i = 0; i < len/4; i++) {
+    if (i != 0 && (i%0x10) == 0) printf("\n");
+    printf("%8x ", d[i]);
+  }
+  printf("\n");
+}
+
+// *********** ioctl interceptor ***********
+
+extern "C" {
+
+int (*my_ioctl)(int filedes, unsigned long request, void *argp) = NULL;
+#undef ioctl
+int ioctl(int filedes, unsigned long request, void *argp) {
+  request &= 0xFFFFFFFF;  // needed on QCOM2
+  if (my_ioctl == NULL) my_ioctl = reinterpret_cast<decltype(my_ioctl)>(dlsym(RTLD_NEXT, "ioctl"));
+  Thneed *thneed = g_thneed;
+
+  // save the fd
+  if (request == IOCTL_KGSL_GPUOBJ_ALLOC) g_fd = filedes;
+
+  // note that this runs always, even without a thneed object
+  if (request == IOCTL_KGSL_DRAWCTXT_CREATE) {
+    struct kgsl_drawctxt_create *create = (struct kgsl_drawctxt_create *)argp;
+    create->flags &= ~KGSL_CONTEXT_PRIORITY_MASK;
+    create->flags |= 6 << KGSL_CONTEXT_PRIORITY_SHIFT;   // priority from 1-15, 1 is max priority
+    printf("IOCTL_KGSL_DRAWCTXT_CREATE: creating context with flags 0x%x\n", create->flags);
+  }
+
+  if (thneed != NULL) {
+    if (request == IOCTL_KGSL_GPU_COMMAND) {
+      struct kgsl_gpu_command *cmd = (struct kgsl_gpu_command *)argp;
+      if (thneed->record) {
+        thneed->timestamp = cmd->timestamp;
+        thneed->context_id = cmd->context_id;
+        thneed->cmds.push_back(unique_ptr<CachedCommand>(new CachedCommand(thneed, cmd)));
+      }
+      if (thneed->debug >= 1) {
+        printf("IOCTL_KGSL_GPU_COMMAND(%2zu): flags: 0x%lx    context_id: %u  timestamp: %u  numcmds: %d  numobjs: %d\n",
+            thneed->cmds.size(),
+            cmd->flags,
+            cmd->context_id, cmd->timestamp, cmd->numcmds, cmd->numobjs);
+      }
+    } else if (request == IOCTL_KGSL_GPUOBJ_SYNC) {
+      struct kgsl_gpuobj_sync *cmd = (struct kgsl_gpuobj_sync *)argp;
+      struct kgsl_gpuobj_sync_obj *objs = (struct kgsl_gpuobj_sync_obj *)(cmd->objs);
+
+      if (thneed->debug >= 2) {
+        printf("IOCTL_KGSL_GPUOBJ_SYNC count:%d ", cmd->count);
+        for (int i = 0; i < cmd->count; i++) {
+          printf(" -- offset:0x%lx len:0x%lx id:%d op:%d  ", objs[i].offset, objs[i].length, objs[i].id, objs[i].op);
+        }
+        printf("\n");
+      }
+
+      if (thneed->record) {
+        thneed->cmds.push_back(unique_ptr<CachedSync>(new
+              CachedSync(thneed, string((char *)objs, sizeof(struct kgsl_gpuobj_sync_obj)*cmd->count))));
+      }
+    } else if (request == IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID) {
+      struct kgsl_device_waittimestamp_ctxtid *cmd = (struct kgsl_device_waittimestamp_ctxtid *)argp;
+      if (thneed->debug >= 1) {
+        printf("IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID: context_id: %d  timestamp: %d  timeout: %d\n",
+            cmd->context_id, cmd->timestamp, cmd->timeout);
+      }
+    } else if (request == IOCTL_KGSL_SETPROPERTY) {
+      if (thneed->debug >= 1) {
+        struct kgsl_device_getproperty *prop = (struct kgsl_device_getproperty *)argp;
+        printf("IOCTL_KGSL_SETPROPERTY: 0x%x sizebytes:%zu\n", prop->type, prop->sizebytes);
+        if (thneed->debug >= 2) {
+          hexdump((uint8_t *)prop->value, prop->sizebytes);
+          if (prop->type == KGSL_PROP_PWR_CONSTRAINT) {
+            struct kgsl_device_constraint *constraint = (struct kgsl_device_constraint *)prop->value;
+            hexdump((uint8_t *)constraint->data, constraint->size);
+          }
+        }
+      }
+    } else if (request == IOCTL_KGSL_DRAWCTXT_CREATE || request == IOCTL_KGSL_DRAWCTXT_DESTROY) {
+      // this happens
+    } else if (request == IOCTL_KGSL_GPUOBJ_ALLOC || request == IOCTL_KGSL_GPUOBJ_FREE) {
+      // this happens
+    } else {
+      if (thneed->debug >= 1) {
+        printf("other ioctl %lx\n", request);
+      }
+    }
+  }
+
+  int ret = my_ioctl(filedes, request, argp);
+  // NOTE: This error message goes into stdout and messes up pyenv
+  // if (ret != 0) printf("ioctl returned %d with errno %d\n", ret, errno);
+  return ret;
+}
+
+}
+
+// *********** GPUMalloc ***********
+
+GPUMalloc::GPUMalloc(int size, int fd) {
+  struct kgsl_gpuobj_alloc alloc;
+  memset(&alloc, 0, sizeof(alloc));
+  alloc.size = size;
+  alloc.flags = 0x10000a00;
+  ioctl(fd, IOCTL_KGSL_GPUOBJ_ALLOC, &alloc);
+  void *addr = mmap64(NULL, alloc.mmapsize, 0x3, 0x1, fd, alloc.id*0x1000);
+  assert(addr != MAP_FAILED);
+
+  base = (uint64_t)addr;
+  remaining = size;
+}
+
+GPUMalloc::~GPUMalloc() {
+  // TODO: free the GPU malloced area
+}
+
+void *GPUMalloc::alloc(int size) {
+  void *ret = (void*)base;
+  size = (size+0xff) & (~0xFF);
+  assert(size <= remaining);
+  remaining -= size;
+  base += size;
+  return ret;
+}
+
+// *********** CachedSync, at the ioctl layer ***********
+
+void CachedSync::exec() {
+  struct kgsl_gpuobj_sync cmd;
+
+  cmd.objs = (uint64_t)data.data();
+  cmd.obj_len = data.length();
+  cmd.count = data.length() / sizeof(struct kgsl_gpuobj_sync_obj);
+
+  int ret = ioctl(thneed->fd, IOCTL_KGSL_GPUOBJ_SYNC, &cmd);
+  assert(ret == 0);
+}
+
+// *********** CachedCommand, at the ioctl layer ***********
+
+CachedCommand::CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd) {
+  thneed = lthneed;
+  assert(cmd->numsyncs == 0);
+
+  memcpy(&cache, cmd, sizeof(cache));
+
+  if (cmd->numcmds > 0) {
+    cmds = make_unique<struct kgsl_command_object[]>(cmd->numcmds);
+    memcpy(cmds.get(), (void *)cmd->cmdlist, sizeof(struct kgsl_command_object)*cmd->numcmds);
+    cache.cmdlist = (uint64_t)cmds.get();
+    for (int i = 0; i < cmd->numcmds; i++) {
+      void *nn = thneed->ram->alloc(cmds[i].size);
+      memcpy(nn, (void*)cmds[i].gpuaddr, cmds[i].size);
+      cmds[i].gpuaddr = (uint64_t)nn;
+    }
+  }
+
+  if (cmd->numobjs > 0) {
+    objs = make_unique<struct kgsl_command_object[]>(cmd->numobjs);
+    memcpy(objs.get(), (void *)cmd->objlist, sizeof(struct kgsl_command_object)*cmd->numobjs);
+    cache.objlist = (uint64_t)objs.get();
+    for (int i = 0; i < cmd->numobjs; i++) {
+      void *nn = thneed->ram->alloc(objs[i].size);
+      memset(nn, 0, objs[i].size);
+      objs[i].gpuaddr = (uint64_t)nn;
+    }
+  }
+
+  kq = thneed->ckq;
+  thneed->ckq.clear();
+}
+
+void CachedCommand::exec() {
+  cache.timestamp = ++thneed->timestamp;
+  int ret = ioctl(thneed->fd, IOCTL_KGSL_GPU_COMMAND, &cache);
+
+  if (thneed->debug >= 1) printf("CachedCommand::exec got %d\n", ret);
+
+  if (thneed->debug >= 2) {
+    for (auto &it : kq) {
+      it->debug_print(false);
+    }
+  }
+
+  assert(ret == 0);
+}
+
+// *********** Thneed ***********
+
+Thneed::Thneed(bool do_clinit, cl_context _context) {
+  // TODO: QCOM2 actually requires a different context
+  //context = _context;
+  if (do_clinit) clinit();
+  assert(g_fd != -1);
+  fd = g_fd;
+  ram = make_unique<GPUMalloc>(0x80000, fd);
+  timestamp = -1;
+  g_thneed = this;
+  char *thneed_debug_env = getenv("THNEED_DEBUG");
+  debug = (thneed_debug_env != NULL) ? atoi(thneed_debug_env) : 0;
+}
+
+void Thneed::wait() {
+  struct kgsl_device_waittimestamp_ctxtid wait;
+  wait.context_id = context_id;
+  wait.timestamp = timestamp;
+  wait.timeout = -1;
+
+  uint64_t tb = nanos_since_boot();
+  int wret = ioctl(fd, IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID, &wait);
+  uint64_t te = nanos_since_boot();
+
+  if (debug >= 1) printf("wait %d after %lu us\n", wret, (te-tb)/1000);
+}
+
+void Thneed::execute(float **finputs, float *foutput, bool slow) {
+  uint64_t tb, te;
+  if (debug >= 1) tb = nanos_since_boot();
+
+  // ****** copy inputs
+  copy_inputs(finputs, true);
+
+  // ****** run commands
+  int i = 0;
+  for (auto &it : cmds) {
+    ++i;
+    if (debug >= 1) printf("run %2d @ %7lu us: ", i, (nanos_since_boot()-tb)/1000);
+    it->exec();
+    if ((i == cmds.size()) || slow) wait();
+  }
+
+  // ****** copy outputs
+  copy_output(foutput);
+
+  if (debug >= 1) {
+    te = nanos_since_boot();
+    printf("model exec in %lu us\n", (te-tb)/1000);
+  }
+}
@@ -137,8 +137,8 @@ void Panda::enable_deepsleep() {
  handle->control_write(0xfb, 0, 0);
 }

-void Panda::send_heartbeat(bool engaged) {
-  handle->control_write(0xf3, engaged, 0);
+void Panda::send_heartbeat(bool engaged, bool engaged_mads) {
+  handle->control_write(0xf3, engaged, engaged_mads);
 }

 void Panda::set_can_speed_kbps(uint16_t bus, uint16_t speed) {
@@ -75,7 +75,7 @@ public:
  std::optional<std::string> get_serial();
  void set_power_saving(bool power_saving);
  void enable_deepsleep();
-  void send_heartbeat(bool engaged);
+  void send_heartbeat(bool engaged, bool engaged_mads);
  void set_can_speed_kbps(uint16_t bus, uint16_t speed);
  void set_can_fd_auto(uint16_t bus, bool enabled);
  void set_data_speed_kbps(uint16_t bus, uint16_t speed);
@@ -41,6 +41,8 @@
 #define CUTOFF_IL 400
 #define SATURATE_IL 1000

+#define ALT_EXP_DISENGAGE_LATERAL_ON_BRAKE 2048
+
 ExitHandler do_exit;

 bool check_all_connected(const std::vector<Panda *> &pandas) {
@@ -53,6 +55,18 @@ bool check_all_connected(const std::vector<Panda *> &pandas) {
  return true;
 }

+bool process_mads_heartbeat(SubMaster *sm) {
+  const int &alt_exp = (*sm)["carParams"].getCarParams().getAlternativeExperience();
+  const bool disengage_lateral_on_brake = (alt_exp & ALT_EXP_DISENGAGE_LATERAL_ON_BRAKE) != 0;
+
+  const auto &mads = (*sm)["selfdriveStateSP"].getSelfdriveStateSP().getMads();
+  const bool heartbeat_type = disengage_lateral_on_brake ? mads.getActive() : mads.getEnabled();
+
+  const bool engaged = sm->allAliveAndValid({"selfdriveStateSP"}) && heartbeat_type;
+
+  return engaged;
+}
+
 Panda *connect(std::string serial="", uint32_t index=0) {
  std::unique_ptr<Panda> panda;
  try {
@@ -144,6 +158,7 @@ void fill_panda_state(cereal::PandaState::Builder &ps, cereal::PandaState::Panda
  ps.setIgnitionLine(health.ignition_line_pkt);
  ps.setIgnitionCan(health.ignition_can_pkt);
  ps.setControlsAllowed(health.controls_allowed_pkt);
+  ps.setControlsAllowedLat(health.controls_allowed_lat_pkt);
  ps.setTxBufferOverflow(health.tx_buffer_overflow_pkt);
  ps.setRxBufferOverflow(health.rx_buffer_overflow_pkt);
  ps.setPandaType(hw_type);
@@ -327,7 +342,7 @@ void send_peripheral_state(Panda *panda, PubMaster *pm) {
 }

 void process_panda_state(std::vector<Panda *> &pandas, PubMaster *pm, bool spoofing_started) {
-  static SubMaster sm({"selfdriveState"});
+  static SubMaster sm({"selfdriveState", "selfdriveStateSP", "carParams"});

  std::vector<std::string> connected_serials;
  for (Panda *p : pandas) {
@@ -366,8 +381,9 @@ void process_panda_state(std::vector<Panda *> &pandas, PubMaster *pm, bool spoof

    sm.update(0);
    const bool engaged = sm.allAliveAndValid({"selfdriveState"}) && sm["selfdriveState"].getSelfdriveState().getEnabled();
+    const bool engaged_mads = process_mads_heartbeat(&sm);
    for (const auto &panda : pandas) {
-      panda->send_heartbeat(engaged);
+      panda->send_heartbeat(engaged, engaged_mads);
    }
  }
 }
@@ -105,6 +105,24 @@ class Events:
      ret.append(event)
    return ret

+  def has(self, event_name: int) -> bool:
+    return event_name in self.events
+
+  def contains_in_list(self, events_list: list[int]) -> bool:
+    return any(event_name in self.events for event_name in events_list)
+
+  def remove(self, event_name: int, static: bool = False) -> None:
+    if static and event_name in self.static_events:
+      self.static_events.remove(event_name)
+
+    if event_name in self.events:
+      self.event_counters[event_name] = self.event_counters[event_name] + 1
+      self.events.remove(event_name)
+
+  def replace(self, prev_event_name: int, cur_event_name: int, static: bool = False) -> None:
+    self.remove(prev_event_name, static)
+    self.add(cur_event_name, static)
+

 class Alert:
  def __init__(self,
@@ -951,6 +969,99 @@ EVENTS: dict[int, dict[str, Alert | AlertCallbackType]] = {
    ET.WARNING: personality_changed_alert,
  },

+  # sunnypilot
+  EventName.lkasEnable: {
+    ET.ENABLE: EngagementAlert(AudibleAlert.engage),
+  },
+
+  EventName.lkasDisable: {
+    ET.USER_DISABLE: EngagementAlert(AudibleAlert.disengage),
+  },
+
+  EventName.manualSteeringRequired: {
+    ET.USER_DISABLE: Alert(
+      "Automatic Lane Centering is OFF",
+      "Manual Steering Required",
+      AlertStatus.normal, AlertSize.mid,
+      Priority.LOW, VisualAlert.none, AudibleAlert.disengage, 1.),
+  },
+
+  EventName.manualLongitudinalRequired: {
+    ET.WARNING: Alert(
+      "Smart/Adaptive Cruise Control: OFF",
+      "Manual Speed Control Required",
+      AlertStatus.normal, AlertSize.mid,
+      Priority.LOW, VisualAlert.none, AudibleAlert.none, 1.),
+  },
+
+  EventName.silentLkasEnable: {
+    ET.ENABLE: EngagementAlert(AudibleAlert.none),
+  },
+
+  EventName.silentLkasDisable: {
+    ET.USER_DISABLE: EngagementAlert(AudibleAlert.none),
+  },
+
+  EventName.silentBrakeHold: {
+    ET.USER_DISABLE: EngagementAlert(AudibleAlert.none),
+    ET.NO_ENTRY: NoEntryAlert("Brake Hold Active"),
+  },
+
+  EventName.silentWrongGear: {
+    ET.WARNING: Alert(
+      "",
+      "",
+      AlertStatus.normal, AlertSize.none,
+      Priority.LOWEST, VisualAlert.none, AudibleAlert.none, 0.),
+    ET.NO_ENTRY: Alert(
+      "Gear not D",
+      "openpilot Unavailable",
+      AlertStatus.normal, AlertSize.mid,
+      Priority.LOW, VisualAlert.none, AudibleAlert.none, 0.),
+  },
+
+  EventName.silentReverseGear: {
+    ET.PERMANENT: Alert(
+      "Reverse\nGear",
+      "",
+      AlertStatus.normal, AlertSize.full,
+      Priority.LOWEST, VisualAlert.none, AudibleAlert.none, .2, creation_delay=0.5),
+    ET.NO_ENTRY: NoEntryAlert("Reverse Gear"),
+  },
+
+  EventName.silentDoorOpen: {
+    ET.WARNING: Alert(
+      "",
+      "",
+      AlertStatus.normal, AlertSize.none,
+      Priority.LOWEST, VisualAlert.none, AudibleAlert.none, 0.),
+    ET.NO_ENTRY: NoEntryAlert("Door Open"),
+  },
+
+  EventName.silentSeatbeltNotLatched: {
+    ET.WARNING: Alert(
+      "",
+      "",
+      AlertStatus.normal, AlertSize.none,
+      Priority.LOWEST, VisualAlert.none, AudibleAlert.none, 0.),
+    ET.NO_ENTRY: NoEntryAlert("Seatbelt Unlatched"),
+  },
+
+  EventName.silentParkBrake: {
+    ET.WARNING: Alert(
+      "",
+      "",
+      AlertStatus.normal, AlertSize.none,
+      Priority.LOWEST, VisualAlert.none, AudibleAlert.none, 0.),
+    ET.NO_ENTRY: NoEntryAlert("Parking Brake Engaged"),
+  },
+
+  EventName.controlsMismatchLateral: {
+    ET.IMMEDIATE_DISABLE: ImmediateDisableAlert("Controls Mismatch: Lateral"),
+    ET.NO_ENTRY: NoEntryAlert("Controls Mismatch: Lateral"),
+  },
+
+
 }


@@ -23,6 +23,8 @@ from openpilot.selfdrive.controls.lib.latcontrol import MIN_LATERAL_CONTROL_SPEE

 from openpilot.system.version import get_build_metadata

+from openpilot.sunnypilot.mads.mads import ModularAssistiveDrivingSystem
+
 REPLAY = "REPLAY" in os.environ
 SIMULATION = "SIMULATION" in os.environ
 TESTING_CLOSET = "TESTING_CLOSET" in os.environ
@@ -131,6 +133,10 @@ class SelfdriveD:
    elif self.CP.passive:
      self.events.add(EventName.dashcamMode, static=True)

+    self.mads = ModularAssistiveDrivingSystem(self)
+    sock_services = list(self.pm.sock.keys()) + ['selfdriveStateSP']
+    self.pm = messaging.PubMaster(sock_services)
+
  def update_events(self, CS):
    """Compute onroadEvents from carState"""

@@ -451,11 +457,25 @@ class SelfdriveD:
      self.pm.send('onroadEvents', ce_send)
    self.events_prev = self.events.names.copy()

+    # selfdriveStateSP
+    ss_sp_msg = messaging.new_message('selfdriveStateSP')
+    ss_sp_msg.valid = True
+    ss_sp = ss_sp_msg.selfdriveStateSP
+    mads = ss_sp.mads
+    mads.state = self.mads.state_machine.state
+    mads.enabled = self.mads.enabled
+    mads.active = self.mads.active
+    mads.available = self.mads.enabled_toggle
+
+    self.pm.send('selfdriveStateSP', ss_sp_msg)
+
  def step(self):
    CS = self.data_sample()
    self.update_events(CS)
    if not self.CP.passive and self.initialized:
      self.enabled, self.active = self.state_machine.update(self.events)
+    if not self.CP.notCar:
+      self.mads.update(CS, self.sm)
    self.update_alerts(CS)

    self.publish_selfdriveState(CS)
@@ -473,6 +493,8 @@ class SelfdriveD:
      self.is_metric = self.params.get_bool("IsMetric")
      self.experimental_mode = self.params.get_bool("ExperimentalMode") and self.CP.openpilotLongitudinalControl
      self.personality = self.read_personality_param()
+
+      self.mads.read_params()
      time.sleep(0.1)

  def run(self):
@@ -41,7 +41,7 @@ class TestAlerts:
    events = log.OnroadEvent.EventName.schema.enumerants

    for name, e in events.items():
-      if not name.endswith("DEPRECATED"):
+      if not name.endswith("DEPRECATED") and not name.startswith("eventReserved"):
        fail_msg = f"{name} @{e} not in EVENTS"
        assert e in EVENTS.keys(), fail_msg

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:737f95d34912db53a303ba6499e6f697b510fa5872b8c71f701a4fe924b5466e
-size 356169
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:79ccc3bd2094ba8a55adedf0007b0152eb3a68edc5e2d35aeccbba122d5811c6
+size 356151
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e359e8f6b5a22b6f3f89b54989dac2110ee3a4463de2d785be83e20cda4f1cb6
-size 254248
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45f39fe1d1dc8c271f577d1a12812e01da34979bf3fffaad01a19a7a61b6d456
+size 256342
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a8e4d044812a714ebdf0b15e73d4466e9ddaafa374368f308803c6b68dcd79ab
-size 332433
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec8f8d879b38a4c8d4319a3bd67872d5e5d348bc5472443c4c8aa1cb1dd62cf5
+size 332404
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:53b80c3c99a6897cafe7d408872c287ab0bbaac2751e344cf4623b312d2e4866
-size 268928
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:68afd54dc68f6abff699d2740f90830b0f492db8818869e8936a63e7bed80458
+size 268827
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a4fa8a841c964e90b9c14b5aede8f30de949d319ae072cc291f0afb1c4c24baa
-size 437808
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:32a458c0b364c1c793a4a843fe7a8fd21dd4ad45ff87dc8ad41a8e8759e52c80
+size 437811
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ce370994de01d6240fa753846ddc7e1b852acac3f649a4c29ea16acae126f974
-size 308578
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b3d412d4066f9ac90788e11a20d99ea3ff92eafdc8a317610b7d9c918aedd59
+size 308644
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:82333cfc026735eac81defae9e01b82b81ac30fd01ad265dbdd311dd97322198
-size 393106
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8ad0c3881d7d88f9038a1b3a5e43c779e84dca47e2f7f6d45a3449cd8a7dec99
+size 393122
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e4ebae1742e3bb7d89f8cd452928c5c6a1b8679155562d23c6303ca4ec2e3b02
-size 334350
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a0f9dc3743e4e09375f7a3289228b64486915ae4ef6dfcff66572a34820d5d2
+size 334502
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:03326fb4486c1ffdd3609a0e0200e65d1f30dca5d9b047501d98b1b2d5321e75
-size 470495
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:944406ae64efb422b568437588fec6a8a8b5b7d9e577dc1d22c83a1d6d3813ff
+size 470417
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:bffa062a5eace4d499c2fe12f2b0eb8c71207f4e7bea0e88456614f00e2859b1
-size 258989
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d80c61dc2a0b3685a522013c8aa8347adfdfc6d0611485da88b3a17771c68301
+size 260913
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:fe97d060fb6e60652b26b26406ca8b03890f6bff4b4bc5cb73fca267068a04c9
-size 217460
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3271e31149f7053d134d3a0217851167e573647a8b415138e8c80d429d0a02c9
+size 217488
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:60e33d209f4f600acf344feca8ab6494a1bf3aafe3c7121e2110a82ea06aa61e
-size 293084
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:35c95eaf222affae2e236f0b717c17e7f3b8a30e85d4ebb4ad1506e5025ce0b8
+size 293078
--- a/Show More
+++ b/Show More