Merge branch 'refs/heads/model-manager-improvements' into model-selector-multi-runner-debug-model-download

Enable model label button only when conditions are met
Previously, the button's state update was misplaced, leading to potential issues with its interactive availability. The logic has been adjusted to ensure it is properly enabled or disabled based on onroad status and download progress. This change improves UX consistency and prevents unintended actions.
2026-06-27 03:02:05 +08:00 · 2025-01-05 22:55:45 +01:00 · 2025-01-05 22:54:29 +01:00 · 2025-01-05 22:50:05 +01:00 · 2025-01-05 22:28:53 +01:00 · 2025-01-05 22:28:26 +01:00
60 changed files with 2889 additions and 8 deletions
@@ -3,6 +3,8 @@
 # to move existing files into LFS:
 # git add --renormalize .
 *.onnx filter=lfs diff=lfs merge=lfs -text
 *.thneed filter=lfs diff=lfs merge=lfs -text
 *.pkl filter=lfs diff=lfs merge=lfs -text
 *.svg filter=lfs diff=lfs merge=lfs -text
 *.png filter=lfs diff=lfs merge=lfs -text
 *.gif filter=lfs diff=lfs merge=lfs -text
@@ -74,6 +74,7 @@ comma*.sh
 selfdrive/modeld/thneed/compile
 selfdrive/modeld/models/*.thneed
 selfdrive/modeld/models/*.pkl
 sunnypilot/modeld/thneed/compile
 *.bz2
 *.zst
@@ -15,4 +15,4 @@
  url = https://github.com/commaai/teleoprtc
 [submodule "tinygrad"]
  path = tinygrad_repo
-  url = https://github.com/tinygrad/tinygrad.git
+  url = https://github.com/commaai/tinygrad.git
@@ -396,6 +396,8 @@ SConscript(['third_party/SConscript'])
 SConscript(['selfdrive/SConscript'])
 SConscript(['sunnypilot/SConscript'])
 if Dir('#tools/cabana/').exists() and GetOption('extras'):
  SConscript(['tools/replay/SConscript'])
  if arch != "larch64":
@@ -64,6 +64,11 @@ struct ModelManagerSP @0xaedffd8f31e7b55d {
    progress @1 :Float32;
    eta @2 :UInt32;
  }
  enum Runner {
    snpe @0;
    tinygrad @1;
  }
  struct ModelBundle {
    index @0 :UInt32;
@@ -212,7 +212,7 @@ std::unordered_map<std::string, uint32_t> keys = {
    // Model Manager params
    {"ModelManager_ActiveBundle", PERSISTENT},
-    {"ModelManager_DownloadIndex", CLEAR_ON_MANAGER_START | CLEAR_ON_OFFROAD_TRANSITION | CLEAR_ON_ONROAD_TRANSITION},
+    {"ModelManager_DownloadIndex", CLEAR_ON_MANAGER_START | CLEAR_ON_ONROAD_TRANSITION},
    {"ModelManager_LastSyncTime", CLEAR_ON_MANAGER_START | CLEAR_ON_OFFROAD_TRANSITION},
    {"ModelManager_ModelsCache", PERSISTENT | BACKUP},
@@ -137,7 +137,7 @@ allow-direct-references = true
 [tool.pytest.ini_options]
 minversion = "6.0"
-addopts = "--ignore=openpilot/ --ignore=opendbc/ --ignore=panda/ --ignore=rednose_repo/ --ignore=tinygrad_repo/ --ignore=teleoprtc_repo/ --ignore=msgq/  -Werror --strict-config --strict-markers --durations=10 -n auto --dist=loadgroup"
+addopts = "--ignore=openpilot/ --ignore=opendbc/ --ignore=panda/ --ignore=rednose_repo/ --ignore=tinygrad_repo/ --ignore=teleoprtc_repo/ --ignore=msgq/ --ignore=sunnypilot/tinygrad_repo/  -Werror --strict-config --strict-markers --durations=10 -n auto --dist=loadgroup"
 cpp_files = "test_*"
 cpp_harness = "selfdrive/test/cpp_harness.py"
 python_files = "test_*.py"
@@ -83,8 +83,6 @@ void SoftwarePanelSP::handleBundleDownloadProgress() {
  if (bundle.getStatus() == cereal::ModelManagerSP::DownloadStatus::DOWNLOADING) {
    currentModelLblBtn->showDescription();
  }
  currentModelLblBtn->setEnabled(!is_onroad && !isDownloading());
 }
 /**
@@ -161,6 +159,7 @@ void SoftwarePanelSP::updateLabels() {
  }
  handleBundleDownloadProgress();
  currentModelLblBtn->setEnabled(!is_onroad && !isDownloading());
  currentModelLblBtn->setValue(GetActiveModelName());
  SoftwarePanel::updateLabels();
 }
@@ -24,7 +24,7 @@ private:
    const SubMaster &sm = *(uiStateSP()->sm);
    const auto model_manager = sm["modelManagerSP"].getModelManagerSP();
-    if (!model_manager.hasSelectedBundle()) {
+    if (!model_manager.hasSelectedBundle() || !sm.updated("modelManagerSP")) {
      return false;
    }
@@ -0,0 +1 @@
 SConscript(['modeld/SConscript'])
@@ -0,0 +1 @@
 *_pyx.cpp
@@ -0,0 +1,58 @@
 import glob
 Import('env', 'envCython', 'arch', 'cereal', 'messaging', 'common', 'gpucommon', 'visionipc', 'transformations')
 lenv = env.Clone()
 lenvCython = envCython.Clone()
 libs = [cereal, messaging, visionipc, gpucommon, common, 'capnp', 'kj', 'pthread']
 frameworks = []
 common_src = [
  "models/commonmodel.cc",
  "transforms/loadyuv.cc",
  "transforms/transform.cc",
 ]
 thneed_src_common = [
  "thneed/thneed_common.cc",
  "thneed/serialize.cc",
 ]
 thneed_src_qcom = thneed_src_common + ["thneed/thneed_qcom2.cc"]
 thneed_src_pc = thneed_src_common + ["thneed/thneed_pc.cc"]
 thneed_src = thneed_src_qcom if arch == "larch64" else thneed_src_pc
 # SNPE except on Mac and ARM Linux
 snpe_lib = []
 if arch != "Darwin" and arch != "aarch64":
  common_src += ['runners/snpemodel.cc']
  snpe_lib += ['SNPE']
 # OpenCL is a framework on Mac
 if arch == "Darwin":
  frameworks += ['OpenCL']
 else:
  libs += ['OpenCL']
 # Set path definitions
 for pathdef, fn in {'TRANSFORM': 'transforms/transform.cl', 'LOADYUV': 'transforms/loadyuv.cl'}.items():
  for xenv in (lenv, lenvCython):
    xenv['CXXFLAGS'].append(f'-D{pathdef}_PATH=\\"{File(fn).abspath}\\"')
 # Compile cython
 snpe_rpath_qcom = "/data/pythonpath/third_party/snpe/larch64"
 snpe_rpath_pc = f"{Dir('#').abspath}/third_party/snpe/x86_64-linux-clang"
 snpe_rpath = lenvCython['RPATH'] + [snpe_rpath_qcom if arch == "larch64" else snpe_rpath_pc]
 cython_libs = envCython["LIBS"] + libs
 snpemodel_lib = lenv.Library('snpemodel', ['runners/snpemodel.cc'])
 commonmodel_lib = lenv.Library('commonmodel', common_src)
 lenvCython.Program('runners/runmodel_pyx.so', 'runners/runmodel_pyx.pyx', LIBS=cython_libs, FRAMEWORKS=frameworks)
 lenvCython.Program('runners/snpemodel_pyx.so', 'runners/snpemodel_pyx.pyx', LIBS=[snpemodel_lib, snpe_lib, *cython_libs], FRAMEWORKS=frameworks, RPATH=snpe_rpath)
 lenvCython.Program('models/commonmodel_pyx.so', 'models/commonmodel_pyx.pyx', LIBS=[commonmodel_lib, *cython_libs], FRAMEWORKS=frameworks)
 if arch == 'larch64' or GetOption('pc_thneed'):
  thneed_lib = env.SharedLibrary('thneed', thneed_src, LIBS=[gpucommon, common, 'OpenCL', 'dl'])
  thneedmodel_lib = env.Library('thneedmodel', ['runners/thneedmodel.cc'])
  lenvCython.Program('runners/thneedmodel_pyx.so', 'runners/thneedmodel_pyx.pyx', LIBS=envCython["LIBS"]+[thneedmodel_lib, thneed_lib, gpucommon, common, 'dl', 'OpenCL'])
@@ -0,0 +1,86 @@
 import numpy as np
 def index_function(idx, max_val=192, max_idx=32):
  return (max_val) * ((idx/max_idx)**2)
 class ModelConstants:
  # time and distance indices
  IDX_N = 33
  T_IDXS = [index_function(idx, max_val=10.0) for idx in range(IDX_N)]
  X_IDXS = [index_function(idx, max_val=192.0) for idx in range(IDX_N)]
  LEAD_T_IDXS = [0., 2., 4., 6., 8., 10.]
  LEAD_T_OFFSETS = [0., 2., 4.]
  META_T_IDXS = [2., 4., 6., 8., 10.]
  # model inputs constants
  MODEL_FREQ = 20
  FEATURE_LEN = 512
  FULL_HISTORY_BUFFER_LEN = 99
  HISTORY_BUFFER_LEN = 24
  DESIRE_LEN = 8
  TRAFFIC_CONVENTION_LEN = 2
  LAT_PLANNER_STATE_LEN = 4
  LATERAL_CONTROL_PARAMS_LEN = 2
  PREV_DESIRED_CURV_LEN = 1
  # model outputs constants
  FCW_THRESHOLDS_5MS2 = np.array([.05, .05, .15, .15, .15], dtype=np.float32)
  FCW_THRESHOLDS_3MS2 = np.array([.7, .7], dtype=np.float32)
  FCW_5MS2_PROBS_WIDTH = 5
  FCW_3MS2_PROBS_WIDTH = 2
  DISENGAGE_WIDTH = 5
  POSE_WIDTH = 6
  WIDE_FROM_DEVICE_WIDTH = 3
  SIM_POSE_WIDTH = 6
  LEAD_WIDTH = 4
  LANE_LINES_WIDTH = 2
  ROAD_EDGES_WIDTH = 2
  PLAN_WIDTH = 15
  DESIRE_PRED_WIDTH = 8
  LAT_PLANNER_SOLUTION_WIDTH = 4
  DESIRED_CURV_WIDTH = 1
  NUM_LANE_LINES = 4
  NUM_ROAD_EDGES = 2
  LEAD_TRAJ_LEN = 6
  DESIRE_PRED_LEN = 4
  PLAN_MHP_N = 5
  LEAD_MHP_N = 2
  PLAN_MHP_SELECTION = 1
  LEAD_MHP_SELECTION = 3
  FCW_THRESHOLD_5MS2_HIGH = 0.15
  FCW_THRESHOLD_5MS2_LOW = 0.05
  FCW_THRESHOLD_3MS2 = 0.7
  CONFIDENCE_BUFFER_LEN = 5
  RYG_GREEN = 0.01165
  RYG_YELLOW = 0.06157
  POLY_PATH_DEGREE = 4
 # model outputs slices
 class Plan:
  POSITION = slice(0, 3)
  VELOCITY = slice(3, 6)
  ACCELERATION = slice(6, 9)
  T_FROM_CURRENT_EULER = slice(9, 12)
  ORIENTATION_RATE = slice(12, 15)
 class Meta:
  ENGAGED = slice(0, 1)
  # next 2, 4, 6, 8, 10 seconds
  GAS_DISENGAGE = slice(1, 31, 6)
  BRAKE_DISENGAGE = slice(2, 31, 6)
  STEER_OVERRIDE = slice(3, 31, 6)
  HARD_BRAKE_3 = slice(4, 31, 6)
  HARD_BRAKE_4 = slice(5, 31, 6)
  HARD_BRAKE_5 = slice(6, 31, 6)
  # next 0, 2, 4, 6, 8, 10 seconds
  GAS_PRESS = slice(31, 55, 4)
  BRAKE_PRESS = slice(32, 55, 4)
  LEFT_BLINKER = slice(33, 55, 4)
  RIGHT_BLINKER = slice(34, 55, 4)
@@ -0,0 +1,237 @@
 import os
 import capnp
 import numpy as np
 from cereal import log
 from openpilot.sunnypilot.modeld.constants import ModelConstants, Plan, Meta
 from openpilot.selfdrive.controls.lib.drive_helpers import MIN_SPEED
 SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')
 ConfidenceClass = log.ModelDataV2.ConfidenceClass
 def curv_from_psis(psi_target, psi_rate, vego, delay):
  vego = np.clip(vego, MIN_SPEED, np.inf)
  curv_from_psi = psi_target / (vego * delay)  # epsilon to prevent divide-by-zero
  return 2*curv_from_psi - psi_rate / vego
 def get_curvature_from_plan(plan, vego, delay):
  psi_target = np.interp(delay, ModelConstants.T_IDXS, plan[:, Plan.T_FROM_CURRENT_EULER][:, 2])
  psi_rate = plan[:, Plan.ORIENTATION_RATE][0, 2]
  return curv_from_psis(psi_target, psi_rate, vego, delay)
 class PublishState:
  def __init__(self):
    self.disengage_buffer = np.zeros(ModelConstants.CONFIDENCE_BUFFER_LEN*ModelConstants.DISENGAGE_WIDTH, dtype=np.float32)
    self.prev_brake_5ms2_probs = np.zeros(ModelConstants.FCW_5MS2_PROBS_WIDTH, dtype=np.float32)
    self.prev_brake_3ms2_probs = np.zeros(ModelConstants.FCW_3MS2_PROBS_WIDTH, dtype=np.float32)
 def fill_xyzt(builder, t, x, y, z, x_std=None, y_std=None, z_std=None):
  builder.t = t
  builder.x = x.tolist()
  builder.y = y.tolist()
  builder.z = z.tolist()
  if x_std is not None:
    builder.xStd = x_std.tolist()
  if y_std is not None:
    builder.yStd = y_std.tolist()
  if z_std is not None:
    builder.zStd = z_std.tolist()
 def fill_xyvat(builder, t, x, y, v, a, x_std=None, y_std=None, v_std=None, a_std=None):
  builder.t = t
  builder.x = x.tolist()
  builder.y = y.tolist()
  builder.v = v.tolist()
  builder.a = a.tolist()
  if x_std is not None:
    builder.xStd = x_std.tolist()
  if y_std is not None:
    builder.yStd = y_std.tolist()
  if v_std is not None:
    builder.vStd = v_std.tolist()
  if a_std is not None:
    builder.aStd = a_std.tolist()
 def fill_xyz_poly(builder, degree, x, y, z):
  xyz = np.stack([x, y, z], axis=1)
  coeffs = np.polynomial.polynomial.polyfit(ModelConstants.T_IDXS, xyz, deg=degree)
  builder.xCoefficients = coeffs[:, 0].tolist()
  builder.yCoefficients = coeffs[:, 1].tolist()
  builder.zCoefficients = coeffs[:, 2].tolist()
 def fill_lane_line_meta(builder, lane_lines, lane_line_probs):
  builder.leftY = lane_lines[1].y[0]
  builder.leftProb = lane_line_probs[1]
  builder.rightY = lane_lines[2].y[0]
  builder.rightProb = lane_line_probs[2]
 def fill_model_msg(base_msg: capnp._DynamicStructBuilder, extended_msg: capnp._DynamicStructBuilder,
                   net_output_data: dict[str, np.ndarray], v_ego: float, delay: float,
                   publish_state: PublishState, vipc_frame_id: int, vipc_frame_id_extra: int,
                   frame_id: int, frame_drop: float, timestamp_eof: int, model_execution_time: float,
                   valid: bool) -> None:
  frame_age = frame_id - vipc_frame_id if frame_id > vipc_frame_id else 0
  frame_drop_perc = frame_drop * 100
  extended_msg.valid = valid
  base_msg.valid = valid
  desired_curv = float(get_curvature_from_plan(net_output_data['plan'][0], v_ego, delay))
  driving_model_data = base_msg.drivingModelData
  driving_model_data.frameId = vipc_frame_id
  driving_model_data.frameIdExtra = vipc_frame_id_extra
  driving_model_data.frameDropPerc = frame_drop_perc
  driving_model_data.modelExecutionTime = model_execution_time
  action = driving_model_data.action
  action.desiredCurvature = desired_curv
  modelV2 = extended_msg.modelV2
  modelV2.frameId = vipc_frame_id
  modelV2.frameIdExtra = vipc_frame_id_extra
  modelV2.frameAge = frame_age
  modelV2.frameDropPerc = frame_drop_perc
  modelV2.timestampEof = timestamp_eof
  modelV2.modelExecutionTime = model_execution_time
  # plan
  position = modelV2.position
  fill_xyzt(position, ModelConstants.T_IDXS, *net_output_data['plan'][0,:,Plan.POSITION].T, *net_output_data['plan_stds'][0,:,Plan.POSITION].T)
  velocity = modelV2.velocity
  fill_xyzt(velocity, ModelConstants.T_IDXS, *net_output_data['plan'][0,:,Plan.VELOCITY].T)
  acceleration = modelV2.acceleration
  fill_xyzt(acceleration, ModelConstants.T_IDXS, *net_output_data['plan'][0,:,Plan.ACCELERATION].T)
  orientation = modelV2.orientation
  fill_xyzt(orientation, ModelConstants.T_IDXS, *net_output_data['plan'][0,:,Plan.T_FROM_CURRENT_EULER].T)
  orientation_rate = modelV2.orientationRate
  fill_xyzt(orientation_rate, ModelConstants.T_IDXS, *net_output_data['plan'][0,:,Plan.ORIENTATION_RATE].T)
  # temporal pose
  temporal_pose = modelV2.temporalPose
  temporal_pose.trans = net_output_data['plan'][0,0,Plan.VELOCITY].tolist()
  temporal_pose.transStd = net_output_data['plan_stds'][0,0,Plan.VELOCITY].tolist()
  temporal_pose.rot = net_output_data['plan'][0,0,Plan.ORIENTATION_RATE].tolist()
  temporal_pose.rotStd = net_output_data['plan_stds'][0,0,Plan.ORIENTATION_RATE].tolist()
  # poly path
  poly_path = driving_model_data.path
  fill_xyz_poly(poly_path, ModelConstants.POLY_PATH_DEGREE, *net_output_data['plan'][0,:,Plan.POSITION].T)
  # lateral planning
  action = modelV2.action
  action.desiredCurvature = desired_curv
  # times at X_IDXS according to model plan
  PLAN_T_IDXS = [np.nan] * ModelConstants.IDX_N
  PLAN_T_IDXS[0] = 0.0
  plan_x = net_output_data['plan'][0,:,Plan.POSITION][:,0].tolist()
  for xidx in range(1, ModelConstants.IDX_N):
    tidx = 0
    # increment tidx until we find an element that's further away than the current xidx
    while tidx < ModelConstants.IDX_N - 1 and plan_x[tidx+1] < ModelConstants.X_IDXS[xidx]:
      tidx += 1
    if tidx == ModelConstants.IDX_N - 1:
      # if the Plan doesn't extend far enough, set plan_t to the max value (10s), then break
      PLAN_T_IDXS[xidx] = ModelConstants.T_IDXS[ModelConstants.IDX_N - 1]
      break
    # interpolate to find `t` for the current xidx
    current_x_val = plan_x[tidx]
    next_x_val = plan_x[tidx+1]
    p = (ModelConstants.X_IDXS[xidx] - current_x_val) / (next_x_val - current_x_val) if abs(next_x_val - current_x_val) > 1e-9 else float('nan')
    PLAN_T_IDXS[xidx] = p * ModelConstants.T_IDXS[tidx+1] + (1 - p) * ModelConstants.T_IDXS[tidx]
  # lane lines
  modelV2.init('laneLines', 4)
  for i in range(4):
    lane_line = modelV2.laneLines[i]
    fill_xyzt(lane_line, PLAN_T_IDXS, np.array(ModelConstants.X_IDXS), net_output_data['lane_lines'][0,i,:,0], net_output_data['lane_lines'][0,i,:,1])
  modelV2.laneLineStds = net_output_data['lane_lines_stds'][0,:,0,0].tolist()
  modelV2.laneLineProbs = net_output_data['lane_lines_prob'][0,1::2].tolist()
  lane_line_meta = driving_model_data.laneLineMeta
  fill_lane_line_meta(lane_line_meta, modelV2.laneLines, modelV2.laneLineProbs)
  # road edges
  modelV2.init('roadEdges', 2)
  for i in range(2):
    road_edge = modelV2.roadEdges[i]
    fill_xyzt(road_edge, PLAN_T_IDXS, np.array(ModelConstants.X_IDXS), net_output_data['road_edges'][0,i,:,0], net_output_data['road_edges'][0,i,:,1])
  modelV2.roadEdgeStds = net_output_data['road_edges_stds'][0,:,0,0].tolist()
  # leads
  modelV2.init('leadsV3', 3)
  for i in range(3):
    lead = modelV2.leadsV3[i]
    fill_xyvat(lead, ModelConstants.LEAD_T_IDXS, *net_output_data['lead'][0,i].T, *net_output_data['lead_stds'][0,i].T)
    lead.prob = net_output_data['lead_prob'][0,i].tolist()
    lead.probTime = ModelConstants.LEAD_T_OFFSETS[i]
  # meta
  meta = modelV2.meta
  meta.desireState = net_output_data['desire_state'][0].reshape(-1).tolist()
  meta.desirePrediction = net_output_data['desire_pred'][0].reshape(-1).tolist()
  meta.engagedProb = net_output_data['meta'][0,Meta.ENGAGED].item()
  meta.init('disengagePredictions')
  disengage_predictions = meta.disengagePredictions
  disengage_predictions.t = ModelConstants.META_T_IDXS
  disengage_predictions.brakeDisengageProbs = net_output_data['meta'][0,Meta.BRAKE_DISENGAGE].tolist()
  disengage_predictions.gasDisengageProbs = net_output_data['meta'][0,Meta.GAS_DISENGAGE].tolist()
  disengage_predictions.steerOverrideProbs = net_output_data['meta'][0,Meta.STEER_OVERRIDE].tolist()
  disengage_predictions.brake3MetersPerSecondSquaredProbs = net_output_data['meta'][0,Meta.HARD_BRAKE_3].tolist()
  disengage_predictions.brake4MetersPerSecondSquaredProbs = net_output_data['meta'][0,Meta.HARD_BRAKE_4].tolist()
  disengage_predictions.brake5MetersPerSecondSquaredProbs = net_output_data['meta'][0,Meta.HARD_BRAKE_5].tolist()
  disengage_predictions.gasPressProbs = net_output_data['meta'][0,Meta.GAS_PRESS].tolist()
  disengage_predictions.brakePressProbs = net_output_data['meta'][0,Meta.BRAKE_PRESS].tolist()
  publish_state.prev_brake_5ms2_probs[:-1] = publish_state.prev_brake_5ms2_probs[1:]
  publish_state.prev_brake_5ms2_probs[-1] = net_output_data['meta'][0,Meta.HARD_BRAKE_5][0]
  publish_state.prev_brake_3ms2_probs[:-1] = publish_state.prev_brake_3ms2_probs[1:]
  publish_state.prev_brake_3ms2_probs[-1] = net_output_data['meta'][0,Meta.HARD_BRAKE_3][0]
  hard_brake_predicted = (publish_state.prev_brake_5ms2_probs > ModelConstants.FCW_THRESHOLDS_5MS2).all() and \
    (publish_state.prev_brake_3ms2_probs > ModelConstants.FCW_THRESHOLDS_3MS2).all()
  meta.hardBrakePredicted = hard_brake_predicted.item()
  # confidence
  if vipc_frame_id % (2*ModelConstants.MODEL_FREQ) == 0:
    # any disengage prob
    brake_disengage_probs = net_output_data['meta'][0,Meta.BRAKE_DISENGAGE]
    gas_disengage_probs = net_output_data['meta'][0,Meta.GAS_DISENGAGE]
    steer_override_probs = net_output_data['meta'][0,Meta.STEER_OVERRIDE]
    any_disengage_probs = 1-((1-brake_disengage_probs)*(1-gas_disengage_probs)*(1-steer_override_probs))
    # independent disengage prob for each 2s slice
    ind_disengage_probs = np.r_[any_disengage_probs[0], np.diff(any_disengage_probs) / (1 - any_disengage_probs[:-1])]
    # rolling buf for 2, 4, 6, 8, 10s
    publish_state.disengage_buffer[:-ModelConstants.DISENGAGE_WIDTH] = publish_state.disengage_buffer[ModelConstants.DISENGAGE_WIDTH:]
    publish_state.disengage_buffer[-ModelConstants.DISENGAGE_WIDTH:] = ind_disengage_probs
  score = 0.
  for i in range(ModelConstants.DISENGAGE_WIDTH):
    score += publish_state.disengage_buffer[i*ModelConstants.DISENGAGE_WIDTH+ModelConstants.DISENGAGE_WIDTH-1-i].item() / ModelConstants.DISENGAGE_WIDTH
  if score < ModelConstants.RYG_GREEN:
    modelV2.confidence = ConfidenceClass.green
  elif score < ModelConstants.RYG_YELLOW:
    modelV2.confidence = ConfidenceClass.yellow
  else:
    modelV2.confidence = ConfidenceClass.red
  # raw prediction if enabled
  if SEND_RAW_PRED:
    modelV2.rawPredictions = net_output_data['raw_pred'].tobytes()
 def fill_pose_msg(msg: capnp._DynamicStructBuilder, net_output_data: dict[str, np.ndarray],
                  vipc_frame_id: int, vipc_dropped_frames: int, timestamp_eof: int, live_calib_seen: bool) -> None:
  msg.valid = live_calib_seen & (vipc_dropped_frames < 1)
  cameraOdometry = msg.cameraOdometry
  cameraOdometry.frameId = vipc_frame_id
  cameraOdometry.timestampEof = timestamp_eof
  cameraOdometry.trans = net_output_data['pose'][0,:3].tolist()
  cameraOdometry.rot = net_output_data['pose'][0,3:].tolist()
  cameraOdometry.wideFromDeviceEuler = net_output_data['wide_from_device_euler'][0,:].tolist()
  cameraOdometry.roadTransformTrans = net_output_data['road_transform'][0,:3].tolist()
  cameraOdometry.transStd = net_output_data['pose_stds'][0,:3].tolist()
  cameraOdometry.rotStd = net_output_data['pose_stds'][0,3:].tolist()
  cameraOdometry.wideFromDeviceEulerStd = net_output_data['wide_from_device_euler_stds'][0,:].tolist()
  cameraOdometry.roadTransformTransStd = net_output_data['road_transform_stds'][0,:3].tolist()
@@ -0,0 +1,28 @@
 #!/usr/bin/env python3
 import sys
 import pathlib
 import onnx
 import codecs
 import pickle
 def get_name_and_shape(value_info:onnx.ValueInfoProto) -> tuple[str, tuple[int,...]]:
  shape = tuple([int(dim.dim_value) for dim in value_info.type.tensor_type.shape.dim])
  name = value_info.name
  return name, shape
 if __name__ == "__main__":
  model_path = pathlib.Path(sys.argv[1])
  model = onnx.load(str(model_path))
  i = [x.key for x in model.metadata_props].index('output_slices')
  output_slices = model.metadata_props[i].value
  metadata = {}
  metadata['output_slices'] = pickle.loads(codecs.decode(output_slices.encode(), "base64"))
  metadata['input_shapes'] = dict([get_name_and_shape(x) for x in model.graph.input])
  metadata['output_shapes'] = dict([get_name_and_shape(x) for x in model.graph.output])
  metadata_path = model_path.parent / (model_path.stem + '_metadata.pkl')
  with open(metadata_path, 'wb') as f:
    pickle.dump(metadata, f)
  print(f'saved metadata to {metadata_path}')
@@ -0,0 +1,10 @@
 #!/usr/bin/env bash
 DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null && pwd)"
 cd "$DIR/../../"
 if [ -f "$DIR/libthneed.so" ]; then
  export LD_PRELOAD="$DIR/libthneed.so"
 fi
 exec "$DIR/modeld.py" "$@"
@@ -0,0 +1,299 @@
 #!/usr/bin/env python3
 import os
 import time
 import pickle
 import numpy as np
 import cereal.messaging as messaging
 from cereal import car, log
 from pathlib import Path
 from setproctitle import setproctitle
 from cereal.messaging import PubMaster, SubMaster
 from msgq.visionipc import VisionIpcClient, VisionStreamType, VisionBuf
 from opendbc.car.car_helpers import get_demo_car_params
 from openpilot.common.swaglog import cloudlog
 from openpilot.common.params import Params
 from openpilot.common.filter_simple import FirstOrderFilter
 from openpilot.common.realtime import config_realtime_process
 from openpilot.common.transformations.camera import DEVICE_CAMERAS
 from openpilot.common.transformations.model import get_warp_matrix
 from openpilot.system import sentry
 from openpilot.selfdrive.controls.lib.desire_helper import DesireHelper
 from openpilot.sunnypilot.modeld.runners import ModelRunner, Runtime
 from openpilot.sunnypilot.modeld.parse_model_outputs import Parser
 from openpilot.sunnypilot.modeld.fill_model_msg import fill_model_msg, fill_pose_msg, PublishState
 from openpilot.sunnypilot.modeld.constants import ModelConstants
 from openpilot.sunnypilot.modeld.models.commonmodel_pyx import DrivingModelFrame, CLContext
 PROCESS_NAME = "sunnypilot.modeld.modeld"
 SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')
 MODEL_PATHS = {
  ModelRunner.THNEED: Path(__file__).parent / 'models/supercombo.thneed',
  ModelRunner.ONNX: Path(__file__).parent / 'models/supercombo.onnx'}
 METADATA_PATH = Path(__file__).parent / 'models/supercombo_metadata.pkl'
 class FrameMeta:
  frame_id: int = 0
  timestamp_sof: int = 0
  timestamp_eof: int = 0
  def __init__(self, vipc=None):
    if vipc is not None:
      self.frame_id, self.timestamp_sof, self.timestamp_eof = vipc.frame_id, vipc.timestamp_sof, vipc.timestamp_eof
 class ModelState:
  frame: DrivingModelFrame
  wide_frame: DrivingModelFrame
  inputs: dict[str, np.ndarray]
  output: np.ndarray
  prev_desire: np.ndarray  # for tracking the rising edge of the pulse
  model: ModelRunner
  def __init__(self, context: CLContext):
    self.frame = DrivingModelFrame(context)
    self.wide_frame = DrivingModelFrame(context)
    self.prev_desire = np.zeros(ModelConstants.DESIRE_LEN, dtype=np.float32)
    self.full_features_20Hz = np.zeros((ModelConstants.FULL_HISTORY_BUFFER_LEN, ModelConstants.FEATURE_LEN), dtype=np.float32)
    self.desire_20Hz =  np.zeros((ModelConstants.FULL_HISTORY_BUFFER_LEN + 1, ModelConstants.DESIRE_LEN), dtype=np.float32)
    # img buffers are managed in openCL transform code
    self.inputs = {
      'desire': np.zeros(ModelConstants.DESIRE_LEN * (ModelConstants.HISTORY_BUFFER_LEN+1), dtype=np.float32),
      'traffic_convention': np.zeros(ModelConstants.TRAFFIC_CONVENTION_LEN, dtype=np.float32),
      'features_buffer': np.zeros(ModelConstants.HISTORY_BUFFER_LEN * ModelConstants.FEATURE_LEN, dtype=np.float32),
    }
    with open(METADATA_PATH, 'rb') as f:
      model_metadata = pickle.load(f)
    self.output_slices = model_metadata['output_slices']
    net_output_size = model_metadata['output_shapes']['outputs'][1]
    self.output = np.zeros(net_output_size, dtype=np.float32)
    self.parser = Parser()
    self.model = ModelRunner(MODEL_PATHS, self.output, Runtime.GPU, False, context)
    self.model.addInput("input_imgs", None)
    self.model.addInput("big_input_imgs", None)
    for k,v in self.inputs.items():
      self.model.addInput(k, v)
  def slice_outputs(self, model_outputs: np.ndarray) -> dict[str, np.ndarray]:
    parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in self.output_slices.items()}
    if SEND_RAW_PRED:
      parsed_model_outputs['raw_pred'] = model_outputs.copy()
    return parsed_model_outputs
  def run(self, buf: VisionBuf, wbuf: VisionBuf, transform: np.ndarray, transform_wide: np.ndarray,
                inputs: dict[str, np.ndarray], prepare_only: bool) -> dict[str, np.ndarray] | None:
    # Model decides when action is completed, so desire input is just a pulse triggered on rising edge
    inputs['desire'][0] = 0
    new_desire = np.where(inputs['desire'] - self.prev_desire > .99, inputs['desire'], 0)
    self.prev_desire[:] = inputs['desire']
    self.desire_20Hz[:-1] = self.desire_20Hz[1:]
    self.desire_20Hz[-1] = new_desire
    self.inputs['desire'][:] = self.desire_20Hz.reshape((25,4,-1)).max(axis=1).flatten()
    self.inputs['traffic_convention'][:] = inputs['traffic_convention']
    self.model.setInputBuffer("input_imgs", self.frame.prepare(buf, transform.flatten(), self.model.getCLBuffer("input_imgs")))
    self.model.setInputBuffer("big_input_imgs", self.wide_frame.prepare(wbuf, transform_wide.flatten(), self.model.getCLBuffer("big_input_imgs")))
    if prepare_only:
      return None
    self.model.execute()
    outputs = self.parser.parse_outputs(self.slice_outputs(self.output))
    self.full_features_20Hz[:-1] = self.full_features_20Hz[1:]
    self.full_features_20Hz[-1] = outputs['hidden_state'][0, :]
    idxs = np.arange(-4,-100,-4)[::-1]
    self.inputs['features_buffer'][:] = self.full_features_20Hz[idxs].flatten()
    return outputs
 def main(demo=False):
  cloudlog.warning("modeld init")
  sentry.set_tag("daemon", PROCESS_NAME)
  cloudlog.bind(daemon=PROCESS_NAME)
  setproctitle(PROCESS_NAME)
  config_realtime_process(7, 54)
  cloudlog.warning("setting up CL context")
  cl_context = CLContext()
  cloudlog.warning("CL context ready; loading model")
  model = ModelState(cl_context)
  cloudlog.warning("models loaded, modeld starting")
  # visionipc clients
  while True:
    available_streams = VisionIpcClient.available_streams("camerad", block=False)
    if available_streams:
      use_extra_client = VisionStreamType.VISION_STREAM_WIDE_ROAD in available_streams and VisionStreamType.VISION_STREAM_ROAD in available_streams
      main_wide_camera = VisionStreamType.VISION_STREAM_ROAD not in available_streams
      break
    time.sleep(.1)
  vipc_client_main_stream = VisionStreamType.VISION_STREAM_WIDE_ROAD if main_wide_camera else VisionStreamType.VISION_STREAM_ROAD
  vipc_client_main = VisionIpcClient("camerad", vipc_client_main_stream, True, cl_context)
  vipc_client_extra = VisionIpcClient("camerad", VisionStreamType.VISION_STREAM_WIDE_ROAD, False, cl_context)
  cloudlog.warning(f"vision stream set up, main_wide_camera: {main_wide_camera}, use_extra_client: {use_extra_client}")
  while not vipc_client_main.connect(False):
    time.sleep(0.1)
  while use_extra_client and not vipc_client_extra.connect(False):
    time.sleep(0.1)
  cloudlog.warning(f"connected main cam with buffer size: {vipc_client_main.buffer_len} ({vipc_client_main.width} x {vipc_client_main.height})")
  if use_extra_client:
    cloudlog.warning(f"connected extra cam with buffer size: {vipc_client_extra.buffer_len} ({vipc_client_extra.width} x {vipc_client_extra.height})")
  # messaging
  pm = PubMaster(["modelV2", "drivingModelData", "cameraOdometry"])
  sm = SubMaster(["deviceState", "carState", "roadCameraState", "liveCalibration", "driverMonitoringState", "carControl"])
  publish_state = PublishState()
  params = Params()
  # setup filter to track dropped frames
  frame_dropped_filter = FirstOrderFilter(0., 10., 1. / ModelConstants.MODEL_FREQ)
  frame_id = 0
  last_vipc_frame_id = 0
  run_count = 0
  model_transform_main = np.zeros((3, 3), dtype=np.float32)
  model_transform_extra = np.zeros((3, 3), dtype=np.float32)
  live_calib_seen = False
  buf_main, buf_extra = None, None
  meta_main = FrameMeta()
  meta_extra = FrameMeta()
  if demo:
    CP = get_demo_car_params()
  else:
    CP = messaging.log_from_bytes(params.get("CarParams", block=True), car.CarParams)
  cloudlog.info("modeld got CarParams: %s", CP.carName)
  # TODO this needs more thought, use .2s extra for now to estimate other delays
  steer_delay = CP.steerActuatorDelay + .2
  DH = DesireHelper()
  while True:
    # Keep receiving frames until we are at least 1 frame ahead of previous extra frame
    while meta_main.timestamp_sof < meta_extra.timestamp_sof + 25000000:
      buf_main = vipc_client_main.recv()
      meta_main = FrameMeta(vipc_client_main)
      if buf_main is None:
        break
    if buf_main is None:
      cloudlog.debug("vipc_client_main no frame")
      continue
    if use_extra_client:
      # Keep receiving extra frames until frame id matches main camera
      while True:
        buf_extra = vipc_client_extra.recv()
        meta_extra = FrameMeta(vipc_client_extra)
        if buf_extra is None or meta_main.timestamp_sof < meta_extra.timestamp_sof + 25000000:
          break
      if buf_extra is None:
        cloudlog.debug("vipc_client_extra no frame")
        continue
      if abs(meta_main.timestamp_sof - meta_extra.timestamp_sof) > 10000000:
        cloudlog.error(f"frames out of sync! main: {meta_main.frame_id} ({meta_main.timestamp_sof / 1e9:.5f}),\
                         extra: {meta_extra.frame_id} ({meta_extra.timestamp_sof / 1e9:.5f})")
    else:
      # Use single camera
      buf_extra = buf_main
      meta_extra = meta_main
    sm.update(0)
    desire = DH.desire
    is_rhd = sm["driverMonitoringState"].isRHD
    frame_id = sm["roadCameraState"].frameId
    v_ego = max(sm["carState"].vEgo, 0.)
    if sm.updated["liveCalibration"] and sm.seen['roadCameraState'] and sm.seen['deviceState']:
      device_from_calib_euler = np.array(sm["liveCalibration"].rpyCalib, dtype=np.float32)
      dc = DEVICE_CAMERAS[(str(sm['deviceState'].deviceType), str(sm['roadCameraState'].sensor))]
      model_transform_main = get_warp_matrix(device_from_calib_euler, dc.ecam.intrinsics if main_wide_camera else dc.fcam.intrinsics, False).astype(np.float32)
      model_transform_extra = get_warp_matrix(device_from_calib_euler, dc.ecam.intrinsics, True).astype(np.float32)
      live_calib_seen = True
    traffic_convention = np.zeros(2)
    traffic_convention[int(is_rhd)] = 1
    vec_desire = np.zeros(ModelConstants.DESIRE_LEN, dtype=np.float32)
    if desire >= 0 and desire < ModelConstants.DESIRE_LEN:
      vec_desire[desire] = 1
    # tracked dropped frames
    vipc_dropped_frames = max(0, meta_main.frame_id - last_vipc_frame_id - 1)
    frames_dropped = frame_dropped_filter.update(min(vipc_dropped_frames, 10))
    if run_count < 10: # let frame drops warm up
      frame_dropped_filter.x = 0.
      frames_dropped = 0.
    run_count = run_count + 1
    frame_drop_ratio = frames_dropped / (1 + frames_dropped)
    prepare_only = vipc_dropped_frames > 0
    if prepare_only:
      cloudlog.error(f"skipping model eval. Dropped {vipc_dropped_frames} frames")
    inputs:dict[str, np.ndarray] = {
      'desire': vec_desire,
      'traffic_convention': traffic_convention,
      }
    mt1 = time.perf_counter()
    model_output = model.run(buf_main, buf_extra, model_transform_main, model_transform_extra, inputs, prepare_only)
    mt2 = time.perf_counter()
    model_execution_time = mt2 - mt1
    if model_output is not None:
      modelv2_send = messaging.new_message('modelV2')
      drivingdata_send = messaging.new_message('drivingModelData')
      posenet_send = messaging.new_message('cameraOdometry')
      fill_model_msg(drivingdata_send, modelv2_send, model_output, v_ego, steer_delay,
                     publish_state, meta_main.frame_id, meta_extra.frame_id, frame_id,
                     frame_drop_ratio, meta_main.timestamp_eof, model_execution_time, live_calib_seen)
      desire_state = modelv2_send.modelV2.meta.desireState
      l_lane_change_prob = desire_state[log.Desire.laneChangeLeft]
      r_lane_change_prob = desire_state[log.Desire.laneChangeRight]
      lane_change_prob = l_lane_change_prob + r_lane_change_prob
      DH.update(sm['carState'], sm['carControl'].latActive, lane_change_prob)
      modelv2_send.modelV2.meta.laneChangeState = DH.lane_change_state
      modelv2_send.modelV2.meta.laneChangeDirection = DH.lane_change_direction
      drivingdata_send.drivingModelData.meta.laneChangeState = DH.lane_change_state
      drivingdata_send.drivingModelData.meta.laneChangeDirection = DH.lane_change_direction
      fill_pose_msg(posenet_send, model_output, meta_main.frame_id, vipc_dropped_frames, meta_main.timestamp_eof, live_calib_seen)
      pm.send('modelV2', modelv2_send)
      pm.send('drivingModelData', drivingdata_send)
      pm.send('cameraOdometry', posenet_send)
    last_vipc_frame_id = meta_main.frame_id
 if __name__ == "__main__":
  try:
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--demo', action='store_true', help='A boolean for demo mode.')
    args = parser.parse_args()
    main(demo=args.demo)
  except KeyboardInterrupt:
    cloudlog.warning(f"child {PROCESS_NAME} got SIGINT")
  except Exception:
    sentry.capture_exception()
    raise
@@ -0,0 +1,62 @@
 ## Neural networks in openpilot
 To view the architecture of the ONNX networks, you can use [netron](https://netron.app/)
 ## Supercombo
 ### Supercombo input format (Full size: 799906 x float32)
 * **image stream**
  * Two consecutive images (256 * 512 * 3 in RGB) recorded at 20 Hz : 393216 = 2 * 6 * 128 * 256
    * Each 256 * 512 image is represented in YUV420 with 6 channels : 6 * 128 * 256
      * Channels 0,1,2,3 represent the full-res Y channel and are represented in numpy as Y[::2, ::2], Y[::2, 1::2], Y[1::2, ::2], and Y[1::2, 1::2]
      * Channel 4 represents the half-res U channel
      * Channel 5 represents the half-res V channel
 * **wide image stream**
  * Two consecutive images (256 * 512 * 3 in RGB) recorded at 20 Hz : 393216 = 2 * 6 * 128 * 256
    * Each 256 * 512 image is represented in YUV420 with 6 channels : 6 * 128 * 256
      * Channels 0,1,2,3 represent the full-res Y channel and are represented in numpy as Y[::2, ::2], Y[::2, 1::2], Y[1::2, ::2], and Y[1::2, 1::2]
      * Channel 4 represents the half-res U channel
      * Channel 5 represents the half-res V channel
 * **desire**
  * one-hot encoded buffer to command model to execute certain actions, bit needs to be sent for the past 5 seconds (at 20FPS) : 100 * 8
 * **traffic convention**
  * one-hot encoded vector to tell model whether traffic is right-hand or left-hand traffic : 2
 * **feature buffer**
  * A buffer of intermediate features that gets appended to the current feature to form a 5 seconds temporal context (at 20FPS) : 99 * 512
 ### Supercombo output format (Full size: XXX x float32)
 Read [here](https://github.com/commaai/openpilot/blob/90af436a121164a51da9fa48d093c29f738adf6a/selfdrive/modeld/models/driving.h#L236) for more.
 ## Driver Monitoring Model
 * .onnx model can be run with onnx runtimes
 * .dlc file is a pre-quantized model and only runs on qualcomm DSPs
 ### input format
 * single image W = 1440 H = 960 luminance channel (Y) from the planar YUV420 format:
  * full input size is 1440 * 960 = 1382400
  * normalized ranging from 0.0 to 1.0 in float32 (onnx runner) or ranging from 0 to 255 in uint8 (snpe runner)
 * camera calibration angles (roll, pitch, yaw) from liveCalibration: 3 x float32 inputs
 ### output format
 * 84 x float32 outputs = 2 + 41 * 2 ([parsing example](https://github.com/commaai/openpilot/blob/22ce4e17ba0d3bfcf37f8255a4dd1dc683fe0c38/selfdrive/modeld/models/dmonitoring.cc#L33))
  * for each person in the front seats (2 * 41)
    * face pose: 12 = 6 + 6
      * face orientation [pitch, yaw, roll] in camera frame: 3
      * face position [dx, dy] relative to image center: 2
      * normalized face size: 1
      * standard deviations for above outputs: 6
    * face visible probability: 1
    * eyes: 20 = (8 + 1) + (8 + 1) + 1 + 1
      * eye position and size, and their standard deviations: 8
      * eye visible probability: 1
      * eye closed probability: 1
    * wearing sunglasses probability: 1
    * face occluded probability: 1
    * touching wheel probability: 1
    * paying attention probability: 1
    * (deprecated) distracted probabilities: 2
    * using phone probability: 1
    * distracted probability: 1
  * common outputs 2
    * poor camera vision probability: 1
    * left hand drive probability: 1
@@ -0,0 +1,69 @@
 #include "sunnypilot/modeld/models/commonmodel.h"
 #include <cmath>
 #include <cstring>
 #include "common/clutil.h"
 DrivingModelFrame::DrivingModelFrame(cl_device_id device_id, cl_context context) : ModelFrame(device_id, context) {
  input_frames = std::make_unique<uint8_t[]>(buf_size);
  //input_frames_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err));
  img_buffer_20hz_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, 5*frame_size_bytes, NULL, &err));
  region.origin = 4 * frame_size_bytes;
  region.size = frame_size_bytes;
  last_img_cl = CL_CHECK_ERR(clCreateSubBuffer(img_buffer_20hz_cl, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err));
  loadyuv_init(&loadyuv, context, device_id, MODEL_WIDTH, MODEL_HEIGHT);
  init_transform(device_id, context, MODEL_WIDTH, MODEL_HEIGHT);
 }
 uint8_t* DrivingModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection, cl_mem* output) {
  run_transform(yuv_cl, MODEL_WIDTH, MODEL_HEIGHT, frame_width, frame_height, frame_stride, frame_uv_offset, projection);
  for (int i = 0; i < 4; i++) {
    CL_CHECK(clEnqueueCopyBuffer(q, img_buffer_20hz_cl, img_buffer_20hz_cl, (i+1)*frame_size_bytes, i*frame_size_bytes, frame_size_bytes, 0, nullptr, nullptr));
  }
  loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, last_img_cl);
  if (output == NULL) {
    CL_CHECK(clEnqueueReadBuffer(q, img_buffer_20hz_cl, CL_TRUE, 0, frame_size_bytes, &input_frames[0], 0, nullptr, nullptr));
    CL_CHECK(clEnqueueReadBuffer(q, last_img_cl, CL_TRUE, 0, frame_size_bytes, &input_frames[MODEL_FRAME_SIZE], 0, nullptr, nullptr));
    clFinish(q);
    return &input_frames[0];
  } else {
    copy_queue(&loadyuv, q, img_buffer_20hz_cl, *output, 0, 0, frame_size_bytes);
    copy_queue(&loadyuv, q, last_img_cl, *output, 0, frame_size_bytes, frame_size_bytes);
    // NOTE: Since thneed is using a different command queue, this clFinish is needed to ensure the image is ready.
    clFinish(q);
    return NULL;
  }
 }
 DrivingModelFrame::~DrivingModelFrame() {
  deinit_transform();
  loadyuv_destroy(&loadyuv);
  CL_CHECK(clReleaseMemObject(img_buffer_20hz_cl));
  CL_CHECK(clReleaseMemObject(last_img_cl));
  CL_CHECK(clReleaseCommandQueue(q));
 }
 MonitoringModelFrame::MonitoringModelFrame(cl_device_id device_id, cl_context context) : ModelFrame(device_id, context) {
  input_frames = std::make_unique<uint8_t[]>(buf_size);
  //input_frame_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err));
  init_transform(device_id, context, MODEL_WIDTH, MODEL_HEIGHT);
 }
 uint8_t* MonitoringModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection, cl_mem* output) {
  run_transform(yuv_cl, MODEL_WIDTH, MODEL_HEIGHT, frame_width, frame_height, frame_stride, frame_uv_offset, projection);
  CL_CHECK(clEnqueueReadBuffer(q, y_cl, CL_TRUE, 0, MODEL_FRAME_SIZE * sizeof(uint8_t), input_frames.get(), 0, nullptr, nullptr));
  clFinish(q);
  //return &y_cl;
  return input_frames.get();
 }
 MonitoringModelFrame::~MonitoringModelFrame() {
  deinit_transform();
  CL_CHECK(clReleaseCommandQueue(q));
 }
@@ -0,0 +1,98 @@
 #pragma once
 #include <cfloat>
 #include <cstdlib>
 #include <cassert>
 #include <memory>
 #define CL_USE_DEPRECATED_OPENCL_1_2_APIS
 #ifdef __APPLE__
 #include <OpenCL/cl.h>
 #else
 #include <CL/cl.h>
 #endif
 #include "common/mat.h"
 #include "selfdrive/modeld/transforms/loadyuv.h"
 #include "selfdrive/modeld/transforms/transform.h"
 class ModelFrame {
 public:
  ModelFrame(cl_device_id device_id, cl_context context) {
    q = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, 0, &err));
  }
  virtual ~ModelFrame() {}
  virtual uint8_t* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection, cl_mem* output) { return NULL; }
  /*
  uint8_t* buffer_from_cl(cl_mem *in_frames, int buffer_size) {
    CL_CHECK(clEnqueueReadBuffer(q, *in_frames, CL_TRUE, 0, buffer_size, input_frames.get(), 0, nullptr, nullptr));
    clFinish(q);
    return &input_frames[0];
  }
  */
  int MODEL_WIDTH;
  int MODEL_HEIGHT;
  int MODEL_FRAME_SIZE;
  int buf_size;
 protected:
  cl_mem y_cl, u_cl, v_cl;
  Transform transform;
  cl_command_queue q;
  std::unique_ptr<uint8_t[]> input_frames;
  void init_transform(cl_device_id device_id, cl_context context, int model_width, int model_height) {
    y_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, model_width * model_height, NULL, &err));
    u_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (model_width / 2) * (model_height / 2), NULL, &err));
    v_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (model_width / 2) * (model_height / 2), NULL, &err));
    transform_init(&transform, context, device_id);
  }
  void deinit_transform() {
    transform_destroy(&transform);
    CL_CHECK(clReleaseMemObject(v_cl));
    CL_CHECK(clReleaseMemObject(u_cl));
    CL_CHECK(clReleaseMemObject(y_cl));
  }
  void run_transform(cl_mem yuv_cl, int model_width, int model_height, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection) {
    transform_queue(&transform, q,
        yuv_cl, frame_width, frame_height, frame_stride, frame_uv_offset,
        y_cl, u_cl, v_cl, model_width, model_height, projection);
  }
 };
 class DrivingModelFrame : public ModelFrame {
 public:
  DrivingModelFrame(cl_device_id device_id, cl_context context);
  ~DrivingModelFrame();
  uint8_t* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection, cl_mem* output);
  const int MODEL_WIDTH = 512;
  const int MODEL_HEIGHT = 256;
  const int MODEL_FRAME_SIZE = MODEL_WIDTH * MODEL_HEIGHT * 3 / 2;
  const int buf_size = MODEL_FRAME_SIZE * 2;
  const size_t frame_size_bytes = MODEL_FRAME_SIZE * sizeof(uint8_t);
 private:
  LoadYUVState loadyuv;
  cl_mem img_buffer_20hz_cl, last_img_cl;//, input_frames_cl;
  cl_buffer_region region;
 };
 class MonitoringModelFrame : public ModelFrame {
 public:
  MonitoringModelFrame(cl_device_id device_id, cl_context context);
  ~MonitoringModelFrame();
  uint8_t* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection, cl_mem* output);
  const int MODEL_WIDTH = 1440;
  const int MODEL_HEIGHT = 960;
  const int MODEL_FRAME_SIZE = MODEL_WIDTH * MODEL_HEIGHT;
  const int buf_size = MODEL_FRAME_SIZE;
 private:
  // cl_mem input_frame_cl;
 };
@@ -0,0 +1,26 @@
 # distutils: language = c++
 from msgq.visionipc.visionipc cimport cl_device_id, cl_context, cl_mem
 cdef extern from "common/mat.h":
  cdef struct mat3:
    float v[9]
 cdef extern from "common/clutil.h":
  cdef unsigned long CL_DEVICE_TYPE_DEFAULT
  cl_device_id cl_get_device_id(unsigned long)
  cl_context cl_create_context(cl_device_id)
 cdef extern from "sunnypilot/modeld/models/commonmodel.h":
  cppclass ModelFrame:
    int buf_size
    # unsigned char * buffer_from_cl(cl_mem*, int);
    unsigned char * prepare(cl_mem, int, int, int, int, mat3, cl_mem*)
  cppclass DrivingModelFrame:
    int buf_size
    DrivingModelFrame(cl_device_id, cl_context)
  cppclass MonitoringModelFrame:
    int buf_size
    MonitoringModelFrame(cl_device_id, cl_context)
@@ -0,0 +1,13 @@
 # distutils: language = c++
 from msgq.visionipc.visionipc cimport cl_mem
 from msgq.visionipc.visionipc_pyx cimport CLContext as BaseCLContext
 cdef class CLContext(BaseCLContext):
  pass
 cdef class CLMem:
  cdef cl_mem * mem
  @staticmethod
  cdef create(void*)
@@ -0,0 +1,76 @@
 # distutils: language = c++
 # cython: c_string_encoding=ascii, language_level=3
 import numpy as np
 cimport numpy as cnp
 from libc.string cimport memcpy
 from libc.stdint cimport uintptr_t
 from msgq.visionipc.visionipc cimport cl_mem
 from msgq.visionipc.visionipc_pyx cimport VisionBuf, CLContext as BaseCLContext
 from sunnypilot.modeld.models.commonmodel cimport CL_DEVICE_TYPE_DEFAULT, cl_get_device_id, cl_create_context
 from sunnypilot.modeld.models.commonmodel cimport mat3, ModelFrame as cppModelFrame, DrivingModelFrame as cppDrivingModelFrame, MonitoringModelFrame as cppMonitoringModelFrame
 cdef class CLContext(BaseCLContext):
  def __cinit__(self):
    self.device_id = cl_get_device_id(CL_DEVICE_TYPE_DEFAULT)
    self.context = cl_create_context(self.device_id)
 cdef class CLMem:
  @staticmethod
  cdef create(void * cmem):
    mem = CLMem()
    mem.mem = <cl_mem*> cmem
    return mem
  @property
  def mem_address(self):
    return <uintptr_t>(self.mem)
 def cl_from_visionbuf(VisionBuf buf):
  return CLMem.create(<void*>&buf.buf.buf_cl)
 cdef class ModelFrame:
  cdef cppModelFrame * frame
  cdef int buf_size
  def __dealloc__(self):
    del self.frame
  def prepare(self, VisionBuf buf, float[:] projection, CLMem output):
    cdef mat3 cprojection
    memcpy(cprojection.v, &projection[0], 9*sizeof(float))
    cdef unsigned char * data
    if output is None:
      data = self.frame.prepare(buf.buf.buf_cl, buf.width, buf.height, buf.stride, buf.uv_offset, cprojection, NULL)
    else:
      data = self.frame.prepare(buf.buf.buf_cl, buf.width, buf.height, buf.stride, buf.uv_offset, cprojection, output.mem)
    if not data:
      return None
    return np.asarray(<cnp.uint8_t[:self.buf_size]> data)
    # return CLMem.create(data)
  # def buffer_from_cl(self, CLMem in_frames):
  #   cdef unsigned char * data2
  #   data2 = self.frame.buffer_from_cl(in_frames.mem, self.buf_size)
  #   return np.asarray(<cnp.uint8_t[:self.buf_size]> data2)
 cdef class DrivingModelFrame(ModelFrame):
  cdef cppDrivingModelFrame * _frame
  def __cinit__(self, CLContext context):
    self._frame = new cppDrivingModelFrame(context.device_id, context.context)
    self.frame = <cppModelFrame*>(self._frame)
    self.buf_size = self._frame.buf_size
 cdef class MonitoringModelFrame(ModelFrame):
  cdef cppMonitoringModelFrame * _frame
  def __cinit__(self, CLContext context):
    self._frame = new cppMonitoringModelFrame(context.device_id, context.context)
    self.frame = <cppModelFrame*>(self._frame)
    self.buf_size = self._frame.buf_size
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:0c896681fd6851de3968433e12f37834429eba265e938cf383200be3e5835cec
 size 49096168
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:af2cb689ec9e31292f759b561e70e4558a38f778558dff39ccff460ccafc0d52
 size 49849624
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:441f2865017c07ee0dfb2488c5d86aab00df7ff5c5ec163959f35c33d74b65e6
 size 594
@@ -0,0 +1,103 @@
 import numpy as np
 from openpilot.sunnypilot.modeld.constants import ModelConstants
 def safe_exp(x, out=None):
  # -11 is around 10**14, more causes float16 overflow
  return np.exp(np.clip(x, -np.inf, 11), out=out)
 def sigmoid(x):
  return 1. / (1. + safe_exp(-x))
 def softmax(x, axis=-1):
  x -= np.max(x, axis=axis, keepdims=True)
  if x.dtype == np.float32 or x.dtype == np.float64:
    safe_exp(x, out=x)
  else:
    x = safe_exp(x)
  x /= np.sum(x, axis=axis, keepdims=True)
  return x
 class Parser:
  def __init__(self, ignore_missing=False):
    self.ignore_missing = ignore_missing
  def check_missing(self, outs, name):
    if name not in outs and not self.ignore_missing:
      raise ValueError(f"Missing output {name}")
    return name not in outs
  def parse_categorical_crossentropy(self, name, outs, out_shape=None):
    if self.check_missing(outs, name):
      return
    raw = outs[name]
    if out_shape is not None:
      raw = raw.reshape((raw.shape[0],) + out_shape)
    outs[name] = softmax(raw, axis=-1)
  def parse_binary_crossentropy(self, name, outs):
    if self.check_missing(outs, name):
      return
    raw = outs[name]
    outs[name] = sigmoid(raw)
  def parse_mdn(self, name, outs, in_N=0, out_N=1, out_shape=None):
    if self.check_missing(outs, name):
      return
    raw = outs[name]
    raw = raw.reshape((raw.shape[0], max(in_N, 1), -1))
    n_values = (raw.shape[2] - out_N)//2
    pred_mu = raw[:,:,:n_values]
    pred_std = safe_exp(raw[:,:,n_values: 2*n_values])
    if in_N > 1:
      weights = np.zeros((raw.shape[0], in_N, out_N), dtype=raw.dtype)
      for i in range(out_N):
        weights[:,:,i - out_N] = softmax(raw[:,:,i - out_N], axis=-1)
      if out_N == 1:
        for fidx in range(weights.shape[0]):
          idxs = np.argsort(weights[fidx][:,0])[::-1]
          weights[fidx] = weights[fidx][idxs]
          pred_mu[fidx] = pred_mu[fidx][idxs]
          pred_std[fidx] = pred_std[fidx][idxs]
      full_shape = tuple([raw.shape[0], in_N] + list(out_shape))
      outs[name + '_weights'] = weights
      outs[name + '_hypotheses'] = pred_mu.reshape(full_shape)
      outs[name + '_stds_hypotheses'] = pred_std.reshape(full_shape)
      pred_mu_final = np.zeros((raw.shape[0], out_N, n_values), dtype=raw.dtype)
      pred_std_final = np.zeros((raw.shape[0], out_N, n_values), dtype=raw.dtype)
      for fidx in range(weights.shape[0]):
        for hidx in range(out_N):
          idxs = np.argsort(weights[fidx,:,hidx])[::-1]
          pred_mu_final[fidx, hidx] = pred_mu[fidx, idxs[0]]
          pred_std_final[fidx, hidx] = pred_std[fidx, idxs[0]]
    else:
      pred_mu_final = pred_mu
      pred_std_final = pred_std
    if out_N > 1:
      final_shape = tuple([raw.shape[0], out_N] + list(out_shape))
    else:
      final_shape = tuple([raw.shape[0],] + list(out_shape))
    outs[name] = pred_mu_final.reshape(final_shape)
    outs[name + '_stds'] = pred_std_final.reshape(final_shape)
  def parse_outputs(self, outs: dict[str, np.ndarray]) -> dict[str, np.ndarray]:
    self.parse_mdn('plan', outs, in_N=ModelConstants.PLAN_MHP_N, out_N=ModelConstants.PLAN_MHP_SELECTION,
                   out_shape=(ModelConstants.IDX_N,ModelConstants.PLAN_WIDTH))
    self.parse_mdn('lane_lines', outs, in_N=0, out_N=0, out_shape=(ModelConstants.NUM_LANE_LINES,ModelConstants.IDX_N,ModelConstants.LANE_LINES_WIDTH))
    self.parse_mdn('road_edges', outs, in_N=0, out_N=0, out_shape=(ModelConstants.NUM_ROAD_EDGES,ModelConstants.IDX_N,ModelConstants.LANE_LINES_WIDTH))
    self.parse_mdn('pose', outs, in_N=0, out_N=0, out_shape=(ModelConstants.POSE_WIDTH,))
    self.parse_mdn('road_transform', outs, in_N=0, out_N=0, out_shape=(ModelConstants.POSE_WIDTH,))
    self.parse_mdn('wide_from_device_euler', outs, in_N=0, out_N=0, out_shape=(ModelConstants.WIDE_FROM_DEVICE_WIDTH,))
    self.parse_mdn('lead', outs, in_N=ModelConstants.LEAD_MHP_N, out_N=ModelConstants.LEAD_MHP_SELECTION,
                   out_shape=(ModelConstants.LEAD_TRAJ_LEN,ModelConstants.LEAD_WIDTH))
    if 'lat_planner_solution' in outs:
      self.parse_mdn('lat_planner_solution', outs, in_N=0, out_N=0, out_shape=(ModelConstants.IDX_N,ModelConstants.LAT_PLANNER_SOLUTION_WIDTH))
    for k in ['lead_prob', 'lane_lines_prob', 'meta']:
      self.parse_binary_crossentropy(k, outs)
    self.parse_categorical_crossentropy('desire_state', outs, out_shape=(ModelConstants.DESIRE_PRED_WIDTH,))
    self.parse_categorical_crossentropy('desire_pred', outs, out_shape=(ModelConstants.DESIRE_PRED_LEN,ModelConstants.DESIRE_PRED_WIDTH))
    return outs
@@ -0,0 +1,27 @@
 import os
 from openpilot.system.hardware import TICI
 from openpilot.sunnypilot.modeld.runners.runmodel_pyx import RunModel, Runtime
 assert Runtime
 USE_THNEED = int(os.getenv('USE_THNEED', str(int(TICI))))
 USE_SNPE = int(os.getenv('USE_SNPE', str(int(TICI))))
 class ModelRunner(RunModel):
  THNEED = 'THNEED'
  SNPE = 'SNPE'
  ONNX = 'ONNX'
  def __new__(cls, paths, *args, **kwargs):
    if ModelRunner.THNEED in paths and USE_THNEED:
      from openpilot.sunnypilot.modeld.runners.thneedmodel_pyx import ThneedModel as Runner
      runner_type = ModelRunner.THNEED
    elif ModelRunner.SNPE in paths and USE_SNPE:
      from openpilot.sunnypilot.modeld.runners.snpemodel_pyx import SNPEModel as Runner
      runner_type = ModelRunner.SNPE
    elif ModelRunner.ONNX in paths:
      from openpilot.sunnypilot.modeld.runners.onnxmodel import ONNXModel as Runner
      runner_type = ModelRunner.ONNX
    else:
      raise Exception("Couldn't select a model runner, make sure to pass at least one valid model path")
    return Runner(str(paths[runner_type]), *args, **kwargs)
@@ -0,0 +1,71 @@
 import os
 import onnx
 import sys
 import numpy as np
 from typing import Any
 from openpilot.sunnypilot.modeld.runners.runmodel_pyx import RunModel
 from openpilot.sunnypilot.modeld.runners.ort_helpers import convert_fp16_to_fp32, ORT_TYPES_TO_NP_TYPES
 def create_ort_session(path, fp16_to_fp32):
  os.environ["OMP_NUM_THREADS"] = "4"
  os.environ["OMP_WAIT_POLICY"] = "PASSIVE"
  import onnxruntime as ort
  print("Onnx available providers: ", ort.get_available_providers(), file=sys.stderr)
  options = ort.SessionOptions()
  options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL
  provider: str | tuple[str, dict[Any, Any]]
  if 'OpenVINOExecutionProvider' in ort.get_available_providers() and 'ONNXCPU' not in os.environ:
    provider = 'OpenVINOExecutionProvider'
  elif 'CUDAExecutionProvider' in ort.get_available_providers() and 'ONNXCPU' not in os.environ:
    options.intra_op_num_threads = 2
    provider = ('CUDAExecutionProvider', {'cudnn_conv_algo_search': 'EXHAUSTIVE'})
  else:
    options.intra_op_num_threads = 2
    options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
    options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    provider = 'CPUExecutionProvider'
  model_data = convert_fp16_to_fp32(onnx.load(path)) if fp16_to_fp32 else path
  print("Onnx selected provider: ", [provider], file=sys.stderr)
  ort_session = ort.InferenceSession(model_data, options, providers=[provider])
  print("Onnx using ", ort_session.get_providers(), file=sys.stderr)
  return ort_session
 class ONNXModel(RunModel):
  def __init__(self, path, output, runtime, use_tf8, cl_context):
    self.inputs = {}
    self.output = output
    self.session = create_ort_session(path, fp16_to_fp32=True)
    self.input_names = [x.name for x in self.session.get_inputs()]
    self.input_shapes = {x.name: [1, *x.shape[1:]] for x in self.session.get_inputs()}
    self.input_dtypes = {x.name: ORT_TYPES_TO_NP_TYPES[x.type] for x in self.session.get_inputs()}
    # run once to initialize CUDA provider
    if "CUDAExecutionProvider" in self.session.get_providers():
      self.session.run(None, {k: np.zeros(self.input_shapes[k], dtype=self.input_dtypes[k]) for k in self.input_names})
    print("ready to run onnx model", self.input_shapes, file=sys.stderr)
  def addInput(self, name, buffer):
    assert name in self.input_names
    self.inputs[name] = buffer
  def setInputBuffer(self, name, buffer):
    assert name in self.inputs
    self.inputs[name] = buffer
  def getCLBuffer(self, name):
    return None
  def execute(self):
    inputs = {k: v.view(self.input_dtypes[k]) for k,v in self.inputs.items()}
    inputs = {k: v.reshape(self.input_shapes[k]).astype(self.input_dtypes[k]) for k,v in inputs.items()}
    outputs = self.session.run(None, inputs)
    assert len(outputs) == 1, "Only single model outputs are supported"
    self.output[:] = outputs[0]
    return self.output
@@ -0,0 +1,36 @@
 import onnx
 import onnxruntime as ort
 import numpy as np
 import itertools
 ORT_TYPES_TO_NP_TYPES = {'tensor(float16)': np.float16, 'tensor(float)': np.float32, 'tensor(uint8)': np.uint8}
 def attributeproto_fp16_to_fp32(attr):
  float32_list = np.frombuffer(attr.raw_data, dtype=np.float16)
  attr.data_type = 1
  attr.raw_data = float32_list.astype(np.float32).tobytes()
 def convert_fp16_to_fp32(model):
  for i in model.graph.initializer:
    if i.data_type == 10:
      attributeproto_fp16_to_fp32(i)
  for i in itertools.chain(model.graph.input, model.graph.output):
    if i.type.tensor_type.elem_type == 10:
      i.type.tensor_type.elem_type = 1
  for i in model.graph.node:
    if i.op_type == 'Cast' and i.attribute[0].i == 10:
      i.attribute[0].i = 1
    for a in i.attribute:
      if hasattr(a, 't'):
        if a.t.data_type == 10:
          attributeproto_fp16_to_fp32(a.t)
  return model.SerializeToString()
 def make_onnx_cpu_runner(model_path):
  options = ort.SessionOptions()
  options.intra_op_num_threads = 4
  options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
  options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
  model_data = convert_fp16_to_fp32(onnx.load(model_path))
  return ort.InferenceSession(model_data,  options, providers=['CPUExecutionProvider'])
@@ -0,0 +1,4 @@
 #pragma once
 #include "sunnypilot/modeld/runners/runmodel.h"
 #include "sunnypilot/modeld/runners/snpemodel.h"
@@ -0,0 +1,49 @@
 #pragma once
 #include <string>
 #include <vector>
 #include <memory>
 #include <cassert>
 #include "common/clutil.h"
 #include "common/swaglog.h"
 #define USE_CPU_RUNTIME 0
 #define USE_GPU_RUNTIME 1
 #define USE_DSP_RUNTIME 2
 struct ModelInput {
  const std::string name;
  float *buffer;
  int size;
  ModelInput(const std::string _name, float *_buffer, int _size) : name(_name), buffer(_buffer), size(_size) {}
  virtual void setBuffer(float *_buffer, int _size) {
    assert(size == _size || size == 0);
    buffer = _buffer;
    size = _size;
  }
 };
 class RunModel {
 public:
  std::vector<std::unique_ptr<ModelInput>> inputs;
  virtual ~RunModel() {}
  virtual void execute() {}
  virtual void* getCLBuffer(const std::string name) { return nullptr; }
  virtual void addInput(const std::string name, float *buffer, int size) {
    inputs.push_back(std::unique_ptr<ModelInput>(new ModelInput(name, buffer, size)));
  }
  virtual void setInputBuffer(const std::string name, float *buffer, int size) {
    for (auto &input : inputs) {
      if (name == input->name) {
        input->setBuffer(buffer, size);
        return;
      }
    }
    LOGE("Tried to update input `%s` but no input with this name exists", name.c_str());
    assert(false);
  }
 };
@@ -0,0 +1,14 @@
 # distutils: language = c++
 from libcpp.string cimport string
 cdef extern from "sunnypilot/modeld/runners/runmodel.h":
  cdef int USE_CPU_RUNTIME
  cdef int USE_GPU_RUNTIME
  cdef int USE_DSP_RUNTIME
  cdef cppclass RunModel:
    void addInput(string, float*, int)
    void setInputBuffer(string, float*, int)
    void * getCLBuffer(string)
    void execute()
@@ -0,0 +1,6 @@
 # distutils: language = c++
 from .runmodel cimport RunModel as cppRunModel
 cdef class RunModel:
  cdef cppRunModel * model
@@ -0,0 +1,37 @@
 # distutils: language = c++
 # cython: c_string_encoding=ascii, language_level=3
 from libcpp.string cimport string
 from .runmodel cimport USE_CPU_RUNTIME, USE_GPU_RUNTIME, USE_DSP_RUNTIME
 from sunnypilot.modeld.models.commonmodel_pyx cimport CLMem
 class Runtime:
  CPU = USE_CPU_RUNTIME
  GPU = USE_GPU_RUNTIME
  DSP = USE_DSP_RUNTIME
 cdef class RunModel:
  def __dealloc__(self):
    del self.model
  def addInput(self, string name, float[:] buffer):
    if buffer is not None:
      self.model.addInput(name, &buffer[0], len(buffer))
    else:
      self.model.addInput(name, NULL, 0)
  def setInputBuffer(self, string name, float[:] buffer):
    if buffer is not None:
      self.model.setInputBuffer(name, &buffer[0], len(buffer))
    else:
      self.model.setInputBuffer(name, NULL, 0)
  def getCLBuffer(self, string name):
    cdef void * cl_buf = self.model.getCLBuffer(name)
    if not cl_buf:
      return None
    return CLMem.create(cl_buf)
  def execute(self):
    self.model.execute()
@@ -0,0 +1,116 @@
 #pragma clang diagnostic ignored "-Wexceptions"
 #include "sunnypilot/modeld/runners/snpemodel.h"
 #include <cstring>
 #include <memory>
 #include <string>
 #include <utility>
 #include <vector>
 #include "common/util.h"
 #include "common/timing.h"
 void PrintErrorStringAndExit() {
  std::cerr << zdl::DlSystem::getLastErrorString() << std::endl;
  std::exit(EXIT_FAILURE);
 }
 SNPEModel::SNPEModel(const std::string path, float *_output, size_t _output_size, int runtime, bool _use_tf8, cl_context context) {
  output = _output;
  output_size = _output_size;
  use_tf8 = _use_tf8;
 #ifdef QCOM2
  if (runtime == USE_GPU_RUNTIME) {
    snpe_runtime = zdl::DlSystem::Runtime_t::GPU;
  } else if (runtime == USE_DSP_RUNTIME) {
    snpe_runtime = zdl::DlSystem::Runtime_t::DSP;
  } else {
    snpe_runtime = zdl::DlSystem::Runtime_t::CPU;
  }
  assert(zdl::SNPE::SNPEFactory::isRuntimeAvailable(snpe_runtime));
 #endif
  model_data = util::read_file(path);
  assert(model_data.size() > 0);
  // load model
  std::unique_ptr<zdl::DlContainer::IDlContainer> container = zdl::DlContainer::IDlContainer::open((uint8_t*)model_data.data(), model_data.size());
  if (!container) { PrintErrorStringAndExit(); }
  LOGW("loaded model with size: %lu", model_data.size());
  // create model runner
  zdl::SNPE::SNPEBuilder snpe_builder(container.get());
  while (!snpe) {
 #ifdef QCOM2
    snpe = snpe_builder.setOutputLayers({})
                       .setRuntimeProcessor(snpe_runtime)
                       .setUseUserSuppliedBuffers(true)
                       .setPerformanceProfile(zdl::DlSystem::PerformanceProfile_t::HIGH_PERFORMANCE)
                       .build();
 #else
    snpe = snpe_builder.setOutputLayers({})
                       .setUseUserSuppliedBuffers(true)
                       .setPerformanceProfile(zdl::DlSystem::PerformanceProfile_t::HIGH_PERFORMANCE)
                       .build();
 #endif
    if (!snpe) std::cerr << zdl::DlSystem::getLastErrorString() << std::endl;
  }
  // create output buffer
  zdl::DlSystem::UserBufferEncodingFloat ub_encoding_float;
  zdl::DlSystem::IUserBufferFactory &ub_factory = zdl::SNPE::SNPEFactory::getUserBufferFactory();
  const auto &output_tensor_names_opt = snpe->getOutputTensorNames();
  if (!output_tensor_names_opt) throw std::runtime_error("Error obtaining output tensor names");
  const auto &output_tensor_names = *output_tensor_names_opt;
  assert(output_tensor_names.size() == 1);
  const char *output_tensor_name = output_tensor_names.at(0);
  const zdl::DlSystem::TensorShape &buffer_shape = snpe->getInputOutputBufferAttributes(output_tensor_name)->getDims();
  if (output_size != 0) {
    assert(output_size == buffer_shape[1]);
  } else {
    output_size = buffer_shape[1];
  }
  std::vector<size_t> output_strides = {output_size * sizeof(float), sizeof(float)};
  output_buffer = ub_factory.createUserBuffer(output, output_size * sizeof(float), output_strides, &ub_encoding_float);
  output_map.add(output_tensor_name, output_buffer.get());
 }
 void SNPEModel::addInput(const std::string name, float *buffer, int size) {
  const int idx = inputs.size();
  const auto &input_tensor_names_opt = snpe->getInputTensorNames();
  if (!input_tensor_names_opt) throw std::runtime_error("Error obtaining input tensor names");
  const auto &input_tensor_names = *input_tensor_names_opt;
  const char *input_tensor_name = input_tensor_names.at(idx);
  const bool input_tf8 = use_tf8 && strcmp(input_tensor_name, "input_img") == 0;  // TODO: This is a terrible hack, get rid of this name check both here and in onnx_runner.py
  LOGW("adding index %d: %s", idx, input_tensor_name);
  zdl::DlSystem::UserBufferEncodingFloat ub_encoding_float;
  zdl::DlSystem::UserBufferEncodingTf8 ub_encoding_tf8(0, 1./255); // network takes 0-1
  zdl::DlSystem::IUserBufferFactory &ub_factory = zdl::SNPE::SNPEFactory::getUserBufferFactory();
  zdl::DlSystem::UserBufferEncoding *input_encoding = input_tf8 ? (zdl::DlSystem::UserBufferEncoding*)&ub_encoding_tf8 : (zdl::DlSystem::UserBufferEncoding*)&ub_encoding_float;
  const auto &buffer_shape_opt = snpe->getInputDimensions(input_tensor_name);
  const zdl::DlSystem::TensorShape &buffer_shape = *buffer_shape_opt;
  size_t size_of_input = input_tf8 ? sizeof(uint8_t) : sizeof(float);
  std::vector<size_t> strides(buffer_shape.rank());
  strides[strides.size() - 1] = size_of_input;
  size_t product = 1;
  for (size_t i = 0; i < buffer_shape.rank(); i++) product *= buffer_shape[i];
  size_t stride = strides[strides.size() - 1];
  for (size_t i = buffer_shape.rank() - 1; i > 0; i--) {
    stride *= buffer_shape[i];
    strides[i-1] = stride;
  }
  auto input_buffer = ub_factory.createUserBuffer(buffer, product*size_of_input, strides, input_encoding);
  input_map.add(input_tensor_name, input_buffer.get());
  inputs.push_back(std::unique_ptr<SNPEModelInput>(new SNPEModelInput(name, buffer, size, std::move(input_buffer))));
 }
 void SNPEModel::execute() {
  if (!snpe->execute(input_map, output_map)) {
    PrintErrorStringAndExit();
  }
 }
@@ -0,0 +1,52 @@
 #pragma once
 #pragma clang diagnostic ignored "-Wdeprecated-declarations"
 #include <memory>
 #include <string>
 #include <utility>
 #include <DlContainer/IDlContainer.hpp>
 #include <DlSystem/DlError.hpp>
 #include <DlSystem/ITensor.hpp>
 #include <DlSystem/ITensorFactory.hpp>
 #include <DlSystem/IUserBuffer.hpp>
 #include <DlSystem/IUserBufferFactory.hpp>
 #include <SNPE/SNPE.hpp>
 #include <SNPE/SNPEBuilder.hpp>
 #include <SNPE/SNPEFactory.hpp>
 #include "sunnypilot/modeld/runners/runmodel.h"
 struct SNPEModelInput : public ModelInput {
  std::unique_ptr<zdl::DlSystem::IUserBuffer> snpe_buffer;
  SNPEModelInput(const std::string _name, float *_buffer, int _size, std::unique_ptr<zdl::DlSystem::IUserBuffer> _snpe_buffer) : ModelInput(_name, _buffer, _size), snpe_buffer(std::move(_snpe_buffer)) {}
  void setBuffer(float *_buffer, int _size) {
    ModelInput::setBuffer(_buffer, _size);
    assert(snpe_buffer->setBufferAddress(_buffer) == true);
  }
 };
 class SNPEModel : public RunModel {
 public:
  SNPEModel(const std::string path, float *_output, size_t _output_size, int runtime, bool use_tf8 = false, cl_context context = NULL);
  void addInput(const std::string name, float *buffer, int size);
  void execute();
 private:
  std::string model_data;
 #ifdef QCOM2
  zdl::DlSystem::Runtime_t snpe_runtime;
 #endif
  // snpe model stuff
  std::unique_ptr<zdl::SNPE::SNPE> snpe;
  zdl::DlSystem::UserBufferMap input_map;
  zdl::DlSystem::UserBufferMap output_map;
  std::unique_ptr<zdl::DlSystem::IUserBuffer> output_buffer;
  bool use_tf8;
  float *output;
  size_t output_size;
 };
@@ -0,0 +1,9 @@
 # distutils: language = c++
 from libcpp.string cimport string
 from msgq.visionipc.visionipc cimport cl_context
 cdef extern from "sunnypilot/modeld/runners/snpemodel.h":
  cdef cppclass SNPEModel:
    SNPEModel(string, float*, size_t, int, bool, cl_context)
@@ -0,0 +1,17 @@
 # distutils: language = c++
 # cython: c_string_encoding=ascii, language_level=3
 import os
 from libcpp cimport bool
 from libcpp.string cimport string
 from .snpemodel cimport SNPEModel as cppSNPEModel
 from sunnypilot.modeld.models.commonmodel_pyx cimport CLContext
 from sunnypilot.modeld.runners.runmodel_pyx cimport RunModel
 from sunnypilot.modeld.runners.runmodel cimport RunModel as cppRunModel
 os.environ['ADSP_LIBRARY_PATH'] = "/data/pythonpath/third_party/snpe/dsp/"
 cdef class SNPEModel(RunModel):
  def __cinit__(self, string path, float[:] output, int runtime, bool use_tf8, CLContext context):
    self.model = <cppRunModel *> new cppSNPEModel(path, &output[0], len(output), runtime, use_tf8, context.context)
@@ -0,0 +1,58 @@
 #include "sunnypilot/modeld/runners/thneedmodel.h"
 #include <string>
 #include "common/swaglog.h"
 ThneedModel::ThneedModel(const std::string path, float *_output, size_t _output_size, int runtime, bool luse_tf8, cl_context context) {
  thneed = new Thneed(true, context);
  thneed->load(path.c_str());
  thneed->clexec();
  recorded = false;
  output = _output;
 }
 void* ThneedModel::getCLBuffer(const std::string name) {
  int index = -1;
  for (int i = 0; i < inputs.size(); i++) {
    if (name == inputs[i]->name) {
      index = i;
      break;
    }
  }
  if (index == -1) {
    LOGE("Tried to get CL buffer for input `%s` but no input with this name exists", name.c_str());
    assert(false);
  }
  if (thneed->input_clmem.size() >= inputs.size()) {
    return &thneed->input_clmem[inputs.size() - index - 1];
  } else {
    return nullptr;
  }
 }
 void ThneedModel::execute() {
  if (!recorded) {
    thneed->record = true;
    float *input_buffers[inputs.size()];
    for (int i = 0; i < inputs.size(); i++) {
      input_buffers[inputs.size() - i - 1] = inputs[i]->buffer;
    }
    thneed->copy_inputs(input_buffers);
    thneed->clexec();
    thneed->copy_output(output);
    thneed->stop();
    recorded = true;
  } else {
    float *input_buffers[inputs.size()];
    for (int i = 0; i < inputs.size(); i++) {
      input_buffers[inputs.size() - i - 1] = inputs[i]->buffer;
    }
    thneed->execute(input_buffers, output);
  }
 }
@@ -0,0 +1,17 @@
 #pragma once
 #include <string>
 #include "sunnypilot/modeld/runners/runmodel.h"
 #include "sunnypilot/modeld/thneed/thneed.h"
 class ThneedModel : public RunModel {
 public:
  ThneedModel(const std::string path, float *_output, size_t _output_size, int runtime, bool use_tf8 = false, cl_context context = NULL);
  void *getCLBuffer(const std::string name);
  void execute();
 private:
  Thneed *thneed = NULL;
  bool recorded;
  float *output;
 };
@@ -0,0 +1,9 @@
 # distutils: language = c++
 from libcpp.string cimport string
 from msgq.visionipc.visionipc cimport cl_context
 cdef extern from "sunnypilot/modeld/runners/thneedmodel.h":
  cdef cppclass ThneedModel:
    ThneedModel(string, float*, size_t, int, bool, cl_context)
@@ -0,0 +1,14 @@
 # distutils: language = c++
 # cython: c_string_encoding=ascii, language_level=3
 from libcpp cimport bool
 from libcpp.string cimport string
 from .thneedmodel cimport ThneedModel as cppThneedModel
 from sunnypilot.modeld.models.commonmodel_pyx cimport CLContext
 from sunnypilot.modeld.runners.runmodel_pyx cimport RunModel
 from sunnypilot.modeld.runners.runmodel cimport RunModel as cppRunModel
 cdef class ThneedModel(RunModel):
  def __cinit__(self, string path, float[:] output, int runtime, bool use_tf8, CLContext context):
    self.model = <cppRunModel *> new cppThneedModel(path, &output[0], len(output), runtime, use_tf8, context.context)
@@ -0,0 +1,8 @@
 thneed is an SNPE accelerator. I know SNPE is already an accelerator, but sometimes things need to go even faster..
 It runs on the local device, and caches a single model run. Then it replays it, but fast.
 thneed slices through abstraction layers like a fish.
 You need a thneed.
@@ -0,0 +1,154 @@
 #include <cassert>
 #include <set>
 #include "third_party/json11/json11.hpp"
 #include "common/util.h"
 #include "common/clutil.h"
 #include "common/swaglog.h"
 #include "sunnypilot/modeld/thneed/thneed.h"
 using namespace json11;
 extern map<cl_program, string> g_program_source;
 void Thneed::load(const char *filename) {
  LOGD("Thneed::load: loading from %s\n", filename);
  string buf = util::read_file(filename);
  int jsz = *(int *)buf.data();
  string jsonerr;
  string jj(buf.data() + sizeof(int), jsz);
  Json jdat = Json::parse(jj, jsonerr);
  map<cl_mem, cl_mem> real_mem;
  real_mem[NULL] = NULL;
  int ptr = sizeof(int)+jsz;
  for (auto &obj : jdat["objects"].array_items()) {
    auto mobj = obj.object_items();
    int sz = mobj["size"].int_value();
    cl_mem clbuf = NULL;
    if (mobj["buffer_id"].string_value().size() > 0) {
      // image buffer must already be allocated
      clbuf = real_mem[*(cl_mem*)(mobj["buffer_id"].string_value().data())];
      assert(mobj["needs_load"].bool_value() == false);
    } else {
      if (mobj["needs_load"].bool_value()) {
        clbuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sz, &buf[ptr], NULL);
        if (debug >= 1) printf("loading %p %d @ 0x%X\n", clbuf, sz, ptr);
        ptr += sz;
      } else {
        // TODO: is there a faster way to init zeroed out buffers?
        void *host_zeros = calloc(sz, 1);
        clbuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sz, host_zeros, NULL);
        free(host_zeros);
      }
    }
    assert(clbuf != NULL);
    if (mobj["arg_type"] == "image2d_t" || mobj["arg_type"] == "image1d_t") {
      cl_image_desc desc = {0};
      desc.image_type = (mobj["arg_type"] == "image2d_t") ? CL_MEM_OBJECT_IMAGE2D : CL_MEM_OBJECT_IMAGE1D_BUFFER;
      desc.image_width = mobj["width"].int_value();
      desc.image_height = mobj["height"].int_value();
      desc.image_row_pitch = mobj["row_pitch"].int_value();
      assert(sz == desc.image_height*desc.image_row_pitch);
 #ifdef QCOM2
      desc.buffer = clbuf;
 #else
      // TODO: we are creating unused buffers on PC
      clReleaseMemObject(clbuf);
 #endif
      cl_image_format format = {0};
      format.image_channel_order = CL_RGBA;
      format.image_channel_data_type = mobj["float32"].bool_value() ? CL_FLOAT : CL_HALF_FLOAT;
      cl_int errcode;
 #ifndef QCOM2
      if (mobj["needs_load"].bool_value()) {
        clbuf = clCreateImage(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, &format, &desc, &buf[ptr-sz], &errcode);
      } else {
        clbuf = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &errcode);
      }
 #else
      clbuf = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &errcode);
 #endif
      if (clbuf == NULL) {
        LOGE("clError: %s create image %zux%zu rp %zu with buffer %p\n", cl_get_error_string(errcode),
             desc.image_width, desc.image_height, desc.image_row_pitch, desc.buffer);
      }
      assert(clbuf != NULL);
    }
    real_mem[*(cl_mem*)(mobj["id"].string_value().data())] = clbuf;
  }
  map<string, cl_program> g_programs;
  for (const auto &[name, source] : jdat["programs"].object_items()) {
    if (debug >= 1) printf("building %s with size %zu\n", name.c_str(), source.string_value().size());
    g_programs[name] = cl_program_from_source(context, device_id, source.string_value());
  }
  for (auto &obj : jdat["inputs"].array_items()) {
    auto mobj = obj.object_items();
    int sz = mobj["size"].int_value();
    cl_mem aa = real_mem[*(cl_mem*)(mobj["buffer_id"].string_value().data())];
    input_clmem.push_back(aa);
    input_sizes.push_back(sz);
    LOGD("Thneed::load: adding input %s with size %d\n", mobj["name"].string_value().data(), sz);
    cl_int cl_err;
    void *ret = clEnqueueMapBuffer(command_queue, aa, CL_TRUE, CL_MAP_WRITE, 0, sz, 0, NULL, NULL, &cl_err);
    if (cl_err != CL_SUCCESS) LOGE("clError: %s map %p %d\n", cl_get_error_string(cl_err), aa, sz);
    assert(cl_err == CL_SUCCESS);
    inputs.push_back(ret);
  }
  for (auto &obj : jdat["outputs"].array_items()) {
    auto mobj = obj.object_items();
    int sz = mobj["size"].int_value();
    LOGD("Thneed::save: adding output with size %d\n", sz);
    // TODO: support multiple outputs
    output = real_mem[*(cl_mem*)(mobj["buffer_id"].string_value().data())];
    assert(output != NULL);
  }
  for (auto &obj : jdat["binaries"].array_items()) {
    string name = obj["name"].string_value();
    size_t length = obj["length"].int_value();
    if (debug >= 1) printf("binary %s with size %zu\n", name.c_str(), length);
    g_programs[name] = cl_program_from_binary(context, device_id, (const uint8_t*)&buf[ptr], length);
    ptr += length;
  }
  for (auto &obj : jdat["kernels"].array_items()) {
    auto gws = obj["global_work_size"];
    auto lws = obj["local_work_size"];
    auto kk = shared_ptr<CLQueuedKernel>(new CLQueuedKernel(this));
    kk->name = obj["name"].string_value();
    kk->program = g_programs[kk->name];
    kk->work_dim = obj["work_dim"].int_value();
    for (int i = 0; i < kk->work_dim; i++) {
      kk->global_work_size[i] = gws[i].int_value();
      kk->local_work_size[i] = lws[i].int_value();
    }
    kk->num_args = obj["num_args"].int_value();
    for (int i = 0; i < kk->num_args; i++) {
      string arg = obj["args"].array_items()[i].string_value();
      int arg_size = obj["args_size"].array_items()[i].int_value();
      kk->args_size.push_back(arg_size);
      if (arg_size == 8) {
        cl_mem val = *(cl_mem*)(arg.data());
        val = real_mem[val];
        kk->args.push_back(string((char*)&val, sizeof(val)));
      } else {
        kk->args.push_back(arg);
      }
    }
    kq.push_back(kk);
  }
  clFinish(command_queue);
 }
@@ -0,0 +1,133 @@
 #pragma once
 #ifndef __user
 #define __user __attribute__(())
 #endif
 #include <cstdint>
 #include <cstdlib>
 #include <memory>
 #include <string>
 #include <vector>
 #include <CL/cl.h>
 #include "third_party/linux/include/msm_kgsl.h"
 using namespace std;
 cl_int thneed_clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value);
 namespace json11 {
  class Json;
 }
 class Thneed;
 class GPUMalloc {
  public:
    GPUMalloc(int size, int fd);
    ~GPUMalloc();
    void *alloc(int size);
  private:
    uint64_t base;
    int remaining;
 };
 class CLQueuedKernel {
  public:
    CLQueuedKernel(Thneed *lthneed) { thneed = lthneed; }
    CLQueuedKernel(Thneed *lthneed,
                   cl_kernel _kernel,
                   cl_uint _work_dim,
                   const size_t *_global_work_size,
                   const size_t *_local_work_size);
    cl_int exec();
    void debug_print(bool verbose);
    int get_arg_num(const char *search_arg_name);
    cl_program program;
    string name;
    cl_uint num_args;
    vector<string> arg_names;
    vector<string> arg_types;
    vector<string> args;
    vector<int> args_size;
    cl_kernel kernel = NULL;
    json11::Json to_json() const;
    cl_uint work_dim;
    size_t global_work_size[3] = {0};
    size_t local_work_size[3] = {0};
  private:
    Thneed *thneed;
 };
 class CachedIoctl {
  public:
    virtual void exec() {}
 };
 class CachedSync: public CachedIoctl {
  public:
    CachedSync(Thneed *lthneed, string ldata) { thneed = lthneed; data = ldata; }
    void exec();
  private:
    Thneed *thneed;
    string data;
 };
 class CachedCommand: public CachedIoctl {
  public:
    CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd);
    void exec();
  private:
    void disassemble(int cmd_index);
    struct kgsl_gpu_command cache;
    unique_ptr<kgsl_command_object[]> cmds;
    unique_ptr<kgsl_command_object[]> objs;
    Thneed *thneed;
    vector<shared_ptr<CLQueuedKernel> > kq;
 };
 class Thneed {
  public:
    Thneed(bool do_clinit=false, cl_context _context = NULL);
    void stop();
    void execute(float **finputs, float *foutput, bool slow=false);
    void wait();
    vector<cl_mem> input_clmem;
    vector<void *> inputs;
    vector<size_t> input_sizes;
    cl_mem output = NULL;
    cl_context context = NULL;
    cl_command_queue command_queue;
    cl_device_id device_id;
    int context_id;
    // protected?
    bool record = false;
    int debug;
    int timestamp;
 #ifdef QCOM2
    unique_ptr<GPUMalloc> ram;
    vector<unique_ptr<CachedIoctl> > cmds;
    int fd;
 #endif
    // all CL kernels
    void copy_inputs(float **finputs, bool internal=false);
    void copy_output(float *foutput);
    cl_int clexec();
    vector<shared_ptr<CLQueuedKernel> > kq;
    // pending CL kernels
    vector<shared_ptr<CLQueuedKernel> > ckq;
    // loading
    void load(const char *filename);
  private:
    void clinit();
 };
@@ -0,0 +1,216 @@
 #include "sunnypilot/modeld/thneed/thneed.h"
 #include <cassert>
 #include <cstring>
 #include <map>
 #include "common/clutil.h"
 #include "common/timing.h"
 map<pair<cl_kernel, int>, string> g_args;
 map<pair<cl_kernel, int>, int> g_args_size;
 map<cl_program, string> g_program_source;
 void Thneed::stop() {
  //printf("Thneed::stop: recorded %lu commands\n", cmds.size());
  record = false;
 }
 void Thneed::clinit() {
  device_id = cl_get_device_id(CL_DEVICE_TYPE_DEFAULT);
  if (context == NULL) context = CL_CHECK_ERR(clCreateContext(NULL, 1, &device_id, NULL, NULL, &err));
  //cl_command_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
  cl_command_queue_properties props[3] = {CL_QUEUE_PROPERTIES, 0, 0};
  command_queue = CL_CHECK_ERR(clCreateCommandQueueWithProperties(context, device_id, props, &err));
  printf("Thneed::clinit done\n");
 }
 cl_int Thneed::clexec() {
  if (debug >= 1) printf("Thneed::clexec: running %lu queued kernels\n", kq.size());
  for (auto &k : kq) {
    if (record) ckq.push_back(k);
    cl_int ret = k->exec();
    assert(ret == CL_SUCCESS);
  }
  return clFinish(command_queue);
 }
 void Thneed::copy_inputs(float **finputs, bool internal) {
  for (int idx = 0; idx < inputs.size(); ++idx) {
    if (debug >= 1) printf("copying %lu -- %p -> %p (cl %p)\n", input_sizes[idx], finputs[idx], inputs[idx], input_clmem[idx]);
    if (internal) {
      // if it's internal, using memcpy is fine since the buffer sync is cached in the ioctl layer
      if (finputs[idx] != NULL) memcpy(inputs[idx], finputs[idx], input_sizes[idx]);
    } else {
      if (finputs[idx] != NULL) CL_CHECK(clEnqueueWriteBuffer(command_queue, input_clmem[idx], CL_TRUE, 0, input_sizes[idx], finputs[idx], 0, NULL, NULL));
    }
  }
 }
 void Thneed::copy_output(float *foutput) {
  if (output != NULL) {
    size_t sz;
    clGetMemObjectInfo(output, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
    if (debug >= 1) printf("copying %lu for output %p -> %p\n", sz, output, foutput);
    CL_CHECK(clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sz, foutput, 0, NULL, NULL));
  } else {
    printf("CAUTION: model output is NULL, does it have no outputs?\n");
  }
 }
 // *********** CLQueuedKernel ***********
 CLQueuedKernel::CLQueuedKernel(Thneed *lthneed,
                               cl_kernel _kernel,
                               cl_uint _work_dim,
                               const size_t *_global_work_size,
                               const size_t *_local_work_size) {
  thneed = lthneed;
  kernel = _kernel;
  work_dim = _work_dim;
  assert(work_dim <= 3);
  for (int i = 0; i < work_dim; i++) {
    global_work_size[i] = _global_work_size[i];
    local_work_size[i] = _local_work_size[i];
  }
  char _name[0x100];
  clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, sizeof(_name), _name, NULL);
  name = string(_name);
  clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS, sizeof(num_args), &num_args, NULL);
  // get args
  for (int i = 0; i < num_args; i++) {
    char arg_name[0x100] = {0};
    clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL);
    arg_names.push_back(string(arg_name));
    clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_name), arg_name, NULL);
    arg_types.push_back(string(arg_name));
    args.push_back(g_args[make_pair(kernel, i)]);
    args_size.push_back(g_args_size[make_pair(kernel, i)]);
  }
  // get program
  clGetKernelInfo(kernel, CL_KERNEL_PROGRAM, sizeof(program), &program, NULL);
 }
 int CLQueuedKernel::get_arg_num(const char *search_arg_name) {
  for (int i = 0; i < num_args; i++) {
    if (arg_names[i] == search_arg_name) return i;
  }
  printf("failed to find %s in %s\n", search_arg_name, name.c_str());
  assert(false);
 }
 cl_int CLQueuedKernel::exec() {
  if (kernel == NULL) {
    kernel = clCreateKernel(program, name.c_str(), NULL);
    arg_names.clear();
    arg_types.clear();
    for (int j = 0; j < num_args; j++) {
      char arg_name[0x100] = {0};
      clGetKernelArgInfo(kernel, j, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL);
      arg_names.push_back(string(arg_name));
      clGetKernelArgInfo(kernel, j, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_name), arg_name, NULL);
      arg_types.push_back(string(arg_name));
      cl_int ret;
      if (args[j].size() != 0) {
        assert(args[j].size() == args_size[j]);
        ret = thneed_clSetKernelArg(kernel, j, args[j].size(), args[j].data());
      } else {
        ret = thneed_clSetKernelArg(kernel, j, args_size[j], NULL);
      }
      assert(ret == CL_SUCCESS);
    }
  }
  if (thneed->debug >= 1) {
    debug_print(thneed->debug >= 2);
  }
  return clEnqueueNDRangeKernel(thneed->command_queue,
    kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL);
 }
 void CLQueuedKernel::debug_print(bool verbose) {
  printf("%p %56s -- ", kernel, name.c_str());
  for (int i = 0; i < work_dim; i++) {
    printf("%4zu ", global_work_size[i]);
  }
  printf(" -- ");
  for (int i = 0; i < work_dim; i++) {
    printf("%4zu ", local_work_size[i]);
  }
  printf("\n");
  if (verbose) {
    for (int i = 0; i < num_args; i++) {
      string arg = args[i];
      printf("  %s %s", arg_types[i].c_str(), arg_names[i].c_str());
      void *arg_value = (void*)arg.data();
      int arg_size = arg.size();
      if (arg_size == 0) {
        printf(" (size) %d", args_size[i]);
      } else if (arg_size == 1) {
        printf(" = %d", *((char*)arg_value));
      } else if (arg_size == 2) {
        printf(" = %d", *((short*)arg_value));
      } else if (arg_size == 4) {
        if (arg_types[i] == "float") {
          printf(" = %f", *((float*)arg_value));
        } else {
          printf(" = %d", *((int*)arg_value));
        }
      } else if (arg_size == 8) {
        cl_mem val = (cl_mem)(*((uintptr_t*)arg_value));
        printf(" = %p", val);
        if (val != NULL) {
          cl_mem_object_type obj_type;
          clGetMemObjectInfo(val, CL_MEM_TYPE, sizeof(obj_type), &obj_type, NULL);
          if (arg_types[i] == "image2d_t" || arg_types[i] == "image1d_t" || obj_type == CL_MEM_OBJECT_IMAGE2D) {
            cl_image_format format;
            size_t width, height, depth, array_size, row_pitch, slice_pitch;
            cl_mem buf;
            clGetImageInfo(val, CL_IMAGE_FORMAT, sizeof(format), &format, NULL);
            assert(format.image_channel_order == CL_RGBA);
            assert(format.image_channel_data_type == CL_HALF_FLOAT || format.image_channel_data_type == CL_FLOAT);
            clGetImageInfo(val, CL_IMAGE_WIDTH, sizeof(width), &width, NULL);
            clGetImageInfo(val, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
            clGetImageInfo(val, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL);
            clGetImageInfo(val, CL_IMAGE_DEPTH, sizeof(depth), &depth, NULL);
            clGetImageInfo(val, CL_IMAGE_ARRAY_SIZE, sizeof(array_size), &array_size, NULL);
            clGetImageInfo(val, CL_IMAGE_SLICE_PITCH, sizeof(slice_pitch), &slice_pitch, NULL);
            assert(depth == 0);
            assert(array_size == 0);
            assert(slice_pitch == 0);
            clGetImageInfo(val, CL_IMAGE_BUFFER, sizeof(buf), &buf, NULL);
            size_t sz = 0;
            if (buf != NULL) clGetMemObjectInfo(buf, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
            printf(" image %zu x %zu rp %zu @ %p buffer %zu", width, height, row_pitch, buf, sz);
          } else {
            size_t sz;
            clGetMemObjectInfo(val, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
            printf(" buffer %zu", sz);
          }
        }
      }
      printf("\n");
    }
  }
 }
 cl_int thneed_clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) {
  g_args_size[make_pair(kernel, arg_index)] = arg_size;
  if (arg_value != NULL) {
    g_args[make_pair(kernel, arg_index)] = string((char*)arg_value, arg_size);
  } else {
    g_args[make_pair(kernel, arg_index)] = string("");
  }
  cl_int ret = clSetKernelArg(kernel, arg_index, arg_size, arg_value);
  return ret;
 }
@@ -0,0 +1,32 @@
 #include "sunnypilot/modeld/thneed/thneed.h"
 #include <cassert>
 #include "common/clutil.h"
 #include "common/timing.h"
 Thneed::Thneed(bool do_clinit, cl_context _context) {
  context = _context;
  if (do_clinit) clinit();
  char *thneed_debug_env = getenv("THNEED_DEBUG");
  debug = (thneed_debug_env != NULL) ? atoi(thneed_debug_env) : 0;
 }
 void Thneed::execute(float **finputs, float *foutput, bool slow) {
  uint64_t tb, te;
  if (debug >= 1) tb = nanos_since_boot();
  // ****** copy inputs
  copy_inputs(finputs);
  // ****** run commands
  clexec();
  // ****** copy outputs
  copy_output(foutput);
  if (debug >= 1) {
    te = nanos_since_boot();
    printf("model exec in %lu us\n", (te-tb)/1000);
  }
 }
@@ -0,0 +1,258 @@
 #include "sunnypilot/modeld/thneed/thneed.h"
 #include <dlfcn.h>
 #include <sys/mman.h>
 #include <cassert>
 #include <cerrno>
 #include <cstring>
 #include <map>
 #include <string>
 #include "common/clutil.h"
 #include "common/timing.h"
 Thneed *g_thneed = NULL;
 int g_fd = -1;
 void hexdump(uint8_t *d, int len) {
  assert((len%4) == 0);
  printf("  dumping %p len 0x%x\n", d, len);
  for (int i = 0; i < len/4; i++) {
    if (i != 0 && (i%0x10) == 0) printf("\n");
    printf("%8x ", d[i]);
  }
  printf("\n");
 }
 // *********** ioctl interceptor ***********
 extern "C" {
 int (*my_ioctl)(int filedes, unsigned long request, void *argp) = NULL;
 #undef ioctl
 int ioctl(int filedes, unsigned long request, void *argp) {
  request &= 0xFFFFFFFF;  // needed on QCOM2
  if (my_ioctl == NULL) my_ioctl = reinterpret_cast<decltype(my_ioctl)>(dlsym(RTLD_NEXT, "ioctl"));
  Thneed *thneed = g_thneed;
  // save the fd
  if (request == IOCTL_KGSL_GPUOBJ_ALLOC) g_fd = filedes;
  // note that this runs always, even without a thneed object
  if (request == IOCTL_KGSL_DRAWCTXT_CREATE) {
    struct kgsl_drawctxt_create *create = (struct kgsl_drawctxt_create *)argp;
    create->flags &= ~KGSL_CONTEXT_PRIORITY_MASK;
    create->flags |= 6 << KGSL_CONTEXT_PRIORITY_SHIFT;   // priority from 1-15, 1 is max priority
    printf("IOCTL_KGSL_DRAWCTXT_CREATE: creating context with flags 0x%x\n", create->flags);
  }
  if (thneed != NULL) {
    if (request == IOCTL_KGSL_GPU_COMMAND) {
      struct kgsl_gpu_command *cmd = (struct kgsl_gpu_command *)argp;
      if (thneed->record) {
        thneed->timestamp = cmd->timestamp;
        thneed->context_id = cmd->context_id;
        thneed->cmds.push_back(unique_ptr<CachedCommand>(new CachedCommand(thneed, cmd)));
      }
      if (thneed->debug >= 1) {
        printf("IOCTL_KGSL_GPU_COMMAND(%2zu): flags: 0x%lx    context_id: %u  timestamp: %u  numcmds: %d  numobjs: %d\n",
            thneed->cmds.size(),
            cmd->flags,
            cmd->context_id, cmd->timestamp, cmd->numcmds, cmd->numobjs);
      }
    } else if (request == IOCTL_KGSL_GPUOBJ_SYNC) {
      struct kgsl_gpuobj_sync *cmd = (struct kgsl_gpuobj_sync *)argp;
      struct kgsl_gpuobj_sync_obj *objs = (struct kgsl_gpuobj_sync_obj *)(cmd->objs);
      if (thneed->debug >= 2) {
        printf("IOCTL_KGSL_GPUOBJ_SYNC count:%d ", cmd->count);
        for (int i = 0; i < cmd->count; i++) {
          printf(" -- offset:0x%lx len:0x%lx id:%d op:%d  ", objs[i].offset, objs[i].length, objs[i].id, objs[i].op);
        }
        printf("\n");
      }
      if (thneed->record) {
        thneed->cmds.push_back(unique_ptr<CachedSync>(new
              CachedSync(thneed, string((char *)objs, sizeof(struct kgsl_gpuobj_sync_obj)*cmd->count))));
      }
    } else if (request == IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID) {
      struct kgsl_device_waittimestamp_ctxtid *cmd = (struct kgsl_device_waittimestamp_ctxtid *)argp;
      if (thneed->debug >= 1) {
        printf("IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID: context_id: %d  timestamp: %d  timeout: %d\n",
            cmd->context_id, cmd->timestamp, cmd->timeout);
      }
    } else if (request == IOCTL_KGSL_SETPROPERTY) {
      if (thneed->debug >= 1) {
        struct kgsl_device_getproperty *prop = (struct kgsl_device_getproperty *)argp;
        printf("IOCTL_KGSL_SETPROPERTY: 0x%x sizebytes:%zu\n", prop->type, prop->sizebytes);
        if (thneed->debug >= 2) {
          hexdump((uint8_t *)prop->value, prop->sizebytes);
          if (prop->type == KGSL_PROP_PWR_CONSTRAINT) {
            struct kgsl_device_constraint *constraint = (struct kgsl_device_constraint *)prop->value;
            hexdump((uint8_t *)constraint->data, constraint->size);
          }
        }
      }
    } else if (request == IOCTL_KGSL_DRAWCTXT_CREATE || request == IOCTL_KGSL_DRAWCTXT_DESTROY) {
      // this happens
    } else if (request == IOCTL_KGSL_GPUOBJ_ALLOC || request == IOCTL_KGSL_GPUOBJ_FREE) {
      // this happens
    } else {
      if (thneed->debug >= 1) {
        printf("other ioctl %lx\n", request);
      }
    }
  }
  int ret = my_ioctl(filedes, request, argp);
  // NOTE: This error message goes into stdout and messes up pyenv
  // if (ret != 0) printf("ioctl returned %d with errno %d\n", ret, errno);
  return ret;
 }
 }
 // *********** GPUMalloc ***********
 GPUMalloc::GPUMalloc(int size, int fd) {
  struct kgsl_gpuobj_alloc alloc;
  memset(&alloc, 0, sizeof(alloc));
  alloc.size = size;
  alloc.flags = 0x10000a00;
  ioctl(fd, IOCTL_KGSL_GPUOBJ_ALLOC, &alloc);
  void *addr = mmap64(NULL, alloc.mmapsize, 0x3, 0x1, fd, alloc.id*0x1000);
  assert(addr != MAP_FAILED);
  base = (uint64_t)addr;
  remaining = size;
 }
 GPUMalloc::~GPUMalloc() {
  // TODO: free the GPU malloced area
 }
 void *GPUMalloc::alloc(int size) {
  void *ret = (void*)base;
  size = (size+0xff) & (~0xFF);
  assert(size <= remaining);
  remaining -= size;
  base += size;
  return ret;
 }
 // *********** CachedSync, at the ioctl layer ***********
 void CachedSync::exec() {
  struct kgsl_gpuobj_sync cmd;
  cmd.objs = (uint64_t)data.data();
  cmd.obj_len = data.length();
  cmd.count = data.length() / sizeof(struct kgsl_gpuobj_sync_obj);
  int ret = ioctl(thneed->fd, IOCTL_KGSL_GPUOBJ_SYNC, &cmd);
  assert(ret == 0);
 }
 // *********** CachedCommand, at the ioctl layer ***********
 CachedCommand::CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd) {
  thneed = lthneed;
  assert(cmd->numsyncs == 0);
  memcpy(&cache, cmd, sizeof(cache));
  if (cmd->numcmds > 0) {
    cmds = make_unique<struct kgsl_command_object[]>(cmd->numcmds);
    memcpy(cmds.get(), (void *)cmd->cmdlist, sizeof(struct kgsl_command_object)*cmd->numcmds);
    cache.cmdlist = (uint64_t)cmds.get();
    for (int i = 0; i < cmd->numcmds; i++) {
      void *nn = thneed->ram->alloc(cmds[i].size);
      memcpy(nn, (void*)cmds[i].gpuaddr, cmds[i].size);
      cmds[i].gpuaddr = (uint64_t)nn;
    }
  }
  if (cmd->numobjs > 0) {
    objs = make_unique<struct kgsl_command_object[]>(cmd->numobjs);
    memcpy(objs.get(), (void *)cmd->objlist, sizeof(struct kgsl_command_object)*cmd->numobjs);
    cache.objlist = (uint64_t)objs.get();
    for (int i = 0; i < cmd->numobjs; i++) {
      void *nn = thneed->ram->alloc(objs[i].size);
      memset(nn, 0, objs[i].size);
      objs[i].gpuaddr = (uint64_t)nn;
    }
  }
  kq = thneed->ckq;
  thneed->ckq.clear();
 }
 void CachedCommand::exec() {
  cache.timestamp = ++thneed->timestamp;
  int ret = ioctl(thneed->fd, IOCTL_KGSL_GPU_COMMAND, &cache);
  if (thneed->debug >= 1) printf("CachedCommand::exec got %d\n", ret);
  if (thneed->debug >= 2) {
    for (auto &it : kq) {
      it->debug_print(false);
    }
  }
  assert(ret == 0);
 }
 // *********** Thneed ***********
 Thneed::Thneed(bool do_clinit, cl_context _context) {
  // TODO: QCOM2 actually requires a different context
  //context = _context;
  if (do_clinit) clinit();
  assert(g_fd != -1);
  fd = g_fd;
  ram = make_unique<GPUMalloc>(0x80000, fd);
  timestamp = -1;
  g_thneed = this;
  char *thneed_debug_env = getenv("THNEED_DEBUG");
  debug = (thneed_debug_env != NULL) ? atoi(thneed_debug_env) : 0;
 }
 void Thneed::wait() {
  struct kgsl_device_waittimestamp_ctxtid wait;
  wait.context_id = context_id;
  wait.timestamp = timestamp;
  wait.timeout = -1;
  uint64_t tb = nanos_since_boot();
  int wret = ioctl(fd, IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID, &wait);
  uint64_t te = nanos_since_boot();
  if (debug >= 1) printf("wait %d after %lu us\n", wret, (te-tb)/1000);
 }
 void Thneed::execute(float **finputs, float *foutput, bool slow) {
  uint64_t tb, te;
  if (debug >= 1) tb = nanos_since_boot();
  // ****** copy inputs
  copy_inputs(finputs, true);
  // ****** run commands
  int i = 0;
  for (auto &it : cmds) {
    ++i;
    if (debug >= 1) printf("run %2d @ %7lu us: ", i, (nanos_since_boot()-tb)/1000);
    it->exec();
    if ((i == cmds.size()) || slow) wait();
  }
  // ****** copy outputs
  copy_output(foutput);
  if (debug >= 1) {
    te = nanos_since_boot();
    printf("model exec in %lu us\n", (te-tb)/1000);
  }
 }
@@ -0,0 +1,76 @@
 #include "selfdrive/modeld/transforms/loadyuv.h"
 #include <cassert>
 #include <cstdio>
 #include <cstring>
 void loadyuv_init(LoadYUVState* s, cl_context ctx, cl_device_id device_id, int width, int height) {
  memset(s, 0, sizeof(*s));
  s->width = width;
  s->height = height;
  char args[1024];
  snprintf(args, sizeof(args),
           "-cl-fast-relaxed-math -cl-denorms-are-zero "
           "-DTRANSFORMED_WIDTH=%d -DTRANSFORMED_HEIGHT=%d",
           width, height);
  cl_program prg = cl_program_from_file(ctx, device_id, LOADYUV_PATH, args);
  s->loadys_krnl = CL_CHECK_ERR(clCreateKernel(prg, "loadys", &err));
  s->loaduv_krnl = CL_CHECK_ERR(clCreateKernel(prg, "loaduv", &err));
  s->copy_krnl = CL_CHECK_ERR(clCreateKernel(prg, "copy", &err));
  // done with this
  CL_CHECK(clReleaseProgram(prg));
 }
 void loadyuv_destroy(LoadYUVState* s) {
  CL_CHECK(clReleaseKernel(s->loadys_krnl));
  CL_CHECK(clReleaseKernel(s->loaduv_krnl));
  CL_CHECK(clReleaseKernel(s->copy_krnl));
 }
 void loadyuv_queue(LoadYUVState* s, cl_command_queue q,
                   cl_mem y_cl, cl_mem u_cl, cl_mem v_cl,
                   cl_mem out_cl) {
  cl_int global_out_off = 0;
  CL_CHECK(clSetKernelArg(s->loadys_krnl, 0, sizeof(cl_mem), &y_cl));
  CL_CHECK(clSetKernelArg(s->loadys_krnl, 1, sizeof(cl_mem), &out_cl));
  CL_CHECK(clSetKernelArg(s->loadys_krnl, 2, sizeof(cl_int), &global_out_off));
  const size_t loadys_work_size = (s->width*s->height)/8;
  CL_CHECK(clEnqueueNDRangeKernel(q, s->loadys_krnl, 1, NULL,
                               &loadys_work_size, NULL, 0, 0, NULL));
  const size_t loaduv_work_size = ((s->width/2)*(s->height/2))/8;
  global_out_off += (s->width*s->height);
  CL_CHECK(clSetKernelArg(s->loaduv_krnl, 0, sizeof(cl_mem), &u_cl));
  CL_CHECK(clSetKernelArg(s->loaduv_krnl, 1, sizeof(cl_mem), &out_cl));
  CL_CHECK(clSetKernelArg(s->loaduv_krnl, 2, sizeof(cl_int), &global_out_off));
  CL_CHECK(clEnqueueNDRangeKernel(q, s->loaduv_krnl, 1, NULL,
                               &loaduv_work_size, NULL, 0, 0, NULL));
  global_out_off += (s->width/2)*(s->height/2);
  CL_CHECK(clSetKernelArg(s->loaduv_krnl, 0, sizeof(cl_mem), &v_cl));
  CL_CHECK(clSetKernelArg(s->loaduv_krnl, 1, sizeof(cl_mem), &out_cl));
  CL_CHECK(clSetKernelArg(s->loaduv_krnl, 2, sizeof(cl_int), &global_out_off));
  CL_CHECK(clEnqueueNDRangeKernel(q, s->loaduv_krnl, 1, NULL,
                               &loaduv_work_size, NULL, 0, 0, NULL));
 }
 void copy_queue(LoadYUVState* s, cl_command_queue q, cl_mem src, cl_mem dst,
                 size_t src_offset, size_t dst_offset, size_t size) {
  CL_CHECK(clSetKernelArg(s->copy_krnl, 0, sizeof(cl_mem), &src));
  CL_CHECK(clSetKernelArg(s->copy_krnl, 1, sizeof(cl_mem), &dst));
  CL_CHECK(clSetKernelArg(s->copy_krnl, 2, sizeof(cl_int), &src_offset));
  CL_CHECK(clSetKernelArg(s->copy_krnl, 3, sizeof(cl_int), &dst_offset));
  const size_t copy_work_size = size/8;
  CL_CHECK(clEnqueueNDRangeKernel(q, s->copy_krnl, 1, NULL,
                              &copy_work_size, NULL, 0, 0, NULL));
 }
@@ -0,0 +1,47 @@
 #define UV_SIZE ((TRANSFORMED_WIDTH/2)*(TRANSFORMED_HEIGHT/2))
 __kernel void loadys(__global uchar8 const * const Y,
                     __global uchar * out,
                     int out_offset)
 {
    const int gid = get_global_id(0);
    const int ois = gid * 8;
    const int oy = ois / TRANSFORMED_WIDTH;
    const int ox = ois % TRANSFORMED_WIDTH;
    const uchar8 ys = Y[gid];
    // 02
    // 13
    __global uchar* outy0;
    __global uchar* outy1;
    if ((oy & 1) == 0) {
      outy0 = out + out_offset; //y0
      outy1 = out + out_offset + UV_SIZE*2; //y2
    } else {
      outy0 = out + out_offset + UV_SIZE; //y1
      outy1 = out + out_offset + UV_SIZE*3; //y3
    }
    vstore4(ys.s0246, 0, outy0 + (oy/2) * (TRANSFORMED_WIDTH/2) + ox/2);
    vstore4(ys.s1357, 0, outy1 + (oy/2) * (TRANSFORMED_WIDTH/2) + ox/2);
 }
 __kernel void loaduv(__global uchar8 const * const in,
                     __global uchar8 * out,
                     int out_offset)
 {
  const int gid = get_global_id(0);
  const uchar8 inv = in[gid];
  out[gid + out_offset / 8] = inv;
 }
 __kernel void copy(__global uchar8 * in,
                   __global uchar8 * out,
                   int in_offset,
                   int out_offset)
 {
  const int gid = get_global_id(0);
  out[gid + out_offset / 8] = in[gid + in_offset / 8];
 }
@@ -0,0 +1,20 @@
 #pragma once
 #include "common/clutil.h"
 typedef struct {
  int width, height;
  cl_kernel loadys_krnl, loaduv_krnl, copy_krnl;
 } LoadYUVState;
 void loadyuv_init(LoadYUVState* s, cl_context ctx, cl_device_id device_id, int width, int height);
 void loadyuv_destroy(LoadYUVState* s);
 void loadyuv_queue(LoadYUVState* s, cl_command_queue q,
                   cl_mem y_cl, cl_mem u_cl, cl_mem v_cl,
                   cl_mem out_cl);
 void copy_queue(LoadYUVState* s, cl_command_queue q, cl_mem src, cl_mem dst,
                 size_t src_offset, size_t dst_offset, size_t size);
@@ -0,0 +1,97 @@
 #include "selfdrive/modeld/transforms/transform.h"
 #include <cassert>
 #include <cstring>
 #include "common/clutil.h"
 void transform_init(Transform* s, cl_context ctx, cl_device_id device_id) {
  memset(s, 0, sizeof(*s));
  cl_program prg = cl_program_from_file(ctx, device_id, TRANSFORM_PATH, "");
  s->krnl = CL_CHECK_ERR(clCreateKernel(prg, "warpPerspective", &err));
  // done with this
  CL_CHECK(clReleaseProgram(prg));
  s->m_y_cl = CL_CHECK_ERR(clCreateBuffer(ctx, CL_MEM_READ_WRITE, 3*3*sizeof(float), NULL, &err));
  s->m_uv_cl = CL_CHECK_ERR(clCreateBuffer(ctx, CL_MEM_READ_WRITE, 3*3*sizeof(float), NULL, &err));
 }
 void transform_destroy(Transform* s) {
  CL_CHECK(clReleaseMemObject(s->m_y_cl));
  CL_CHECK(clReleaseMemObject(s->m_uv_cl));
  CL_CHECK(clReleaseKernel(s->krnl));
 }
 void transform_queue(Transform* s,
                     cl_command_queue q,
                     cl_mem in_yuv, int in_width, int in_height, int in_stride, int in_uv_offset,
                     cl_mem out_y, cl_mem out_u, cl_mem out_v,
                     int out_width, int out_height,
                     const mat3& projection) {
  const int zero = 0;
  // sampled using pixel center origin
  // (because that's how fastcv and opencv does it)
  mat3 projection_y = projection;
  // in and out uv is half the size of y.
  mat3 projection_uv = transform_scale_buffer(projection, 0.5);
  CL_CHECK(clEnqueueWriteBuffer(q, s->m_y_cl, CL_TRUE, 0, 3*3*sizeof(float), (void*)projection_y.v, 0, NULL, NULL));
  CL_CHECK(clEnqueueWriteBuffer(q, s->m_uv_cl, CL_TRUE, 0, 3*3*sizeof(float), (void*)projection_uv.v, 0, NULL, NULL));
  const int in_y_width = in_width;
  const int in_y_height = in_height;
  const int in_y_px_stride = 1;
  const int in_uv_width = in_width/2;
  const int in_uv_height = in_height/2;
  const int in_uv_px_stride = 2;
  const int in_u_offset = in_uv_offset;
  const int in_v_offset = in_uv_offset + 1;
  const int out_y_width = out_width;
  const int out_y_height = out_height;
  const int out_uv_width = out_width/2;
  const int out_uv_height = out_height/2;
  CL_CHECK(clSetKernelArg(s->krnl, 0, sizeof(cl_mem), &in_yuv));  // src
  CL_CHECK(clSetKernelArg(s->krnl, 1, sizeof(cl_int), &in_stride));  // src_row_stride
  CL_CHECK(clSetKernelArg(s->krnl, 2, sizeof(cl_int), &in_y_px_stride));  // src_px_stride
  CL_CHECK(clSetKernelArg(s->krnl, 3, sizeof(cl_int), &zero));  // src_offset
  CL_CHECK(clSetKernelArg(s->krnl, 4, sizeof(cl_int), &in_y_height));  // src_rows
  CL_CHECK(clSetKernelArg(s->krnl, 5, sizeof(cl_int), &in_y_width));  // src_cols
  CL_CHECK(clSetKernelArg(s->krnl, 6, sizeof(cl_mem), &out_y));  // dst
  CL_CHECK(clSetKernelArg(s->krnl, 7, sizeof(cl_int), &out_y_width));  // dst_row_stride
  CL_CHECK(clSetKernelArg(s->krnl, 8, sizeof(cl_int), &zero));  // dst_offset
  CL_CHECK(clSetKernelArg(s->krnl, 9, sizeof(cl_int), &out_y_height));  // dst_rows
  CL_CHECK(clSetKernelArg(s->krnl, 10, sizeof(cl_int), &out_y_width));  // dst_cols
  CL_CHECK(clSetKernelArg(s->krnl, 11, sizeof(cl_mem), &s->m_y_cl));  // M
  const size_t work_size_y[2] = {(size_t)out_y_width, (size_t)out_y_height};
  CL_CHECK(clEnqueueNDRangeKernel(q, s->krnl, 2, NULL,
                              (const size_t*)&work_size_y, NULL, 0, 0, NULL));
  const size_t work_size_uv[2] = {(size_t)out_uv_width, (size_t)out_uv_height};
  CL_CHECK(clSetKernelArg(s->krnl, 2, sizeof(cl_int), &in_uv_px_stride));  // src_px_stride
  CL_CHECK(clSetKernelArg(s->krnl, 3, sizeof(cl_int), &in_u_offset));  // src_offset
  CL_CHECK(clSetKernelArg(s->krnl, 4, sizeof(cl_int), &in_uv_height));  // src_rows
  CL_CHECK(clSetKernelArg(s->krnl, 5, sizeof(cl_int), &in_uv_width));  // src_cols
  CL_CHECK(clSetKernelArg(s->krnl, 6, sizeof(cl_mem), &out_u));  // dst
  CL_CHECK(clSetKernelArg(s->krnl, 7, sizeof(cl_int), &out_uv_width));  // dst_row_stride
  CL_CHECK(clSetKernelArg(s->krnl, 8, sizeof(cl_int), &zero));  // dst_offset
  CL_CHECK(clSetKernelArg(s->krnl, 9, sizeof(cl_int), &out_uv_height));  // dst_rows
  CL_CHECK(clSetKernelArg(s->krnl, 10, sizeof(cl_int), &out_uv_width));  // dst_cols
  CL_CHECK(clSetKernelArg(s->krnl, 11, sizeof(cl_mem), &s->m_uv_cl));  // M
  CL_CHECK(clEnqueueNDRangeKernel(q, s->krnl, 2, NULL,
                              (const size_t*)&work_size_uv, NULL, 0, 0, NULL));
  CL_CHECK(clSetKernelArg(s->krnl, 3, sizeof(cl_int), &in_v_offset));  // src_ofset
  CL_CHECK(clSetKernelArg(s->krnl, 6, sizeof(cl_mem), &out_v));  // dst
  CL_CHECK(clEnqueueNDRangeKernel(q, s->krnl, 2, NULL,
                              (const size_t*)&work_size_uv, NULL, 0, 0, NULL));
 }
@@ -0,0 +1,54 @@
 #define INTER_BITS 5
 #define INTER_TAB_SIZE (1 << INTER_BITS)
 #define INTER_SCALE 1.f / INTER_TAB_SIZE
 #define INTER_REMAP_COEF_BITS 15
 #define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS)
 __kernel void warpPerspective(__global const uchar * src,
                              int src_row_stride, int src_px_stride, int src_offset, int src_rows, int src_cols,
                              __global uchar * dst,
                              int dst_row_stride, int dst_offset, int dst_rows, int dst_cols,
                              __constant float * M)
 {
    int dx = get_global_id(0);
    int dy = get_global_id(1);
    if (dx < dst_cols && dy < dst_rows)
    {
        float X0 = M[0] * dx + M[1] * dy + M[2];
        float Y0 = M[3] * dx + M[4] * dy + M[5];
        float W = M[6] * dx + M[7] * dy + M[8];
        W = W != 0.0f ? INTER_TAB_SIZE / W : 0.0f;
        int X = rint(X0 * W), Y = rint(Y0 * W);
        int sx = convert_short_sat(X >> INTER_BITS);
        int sy = convert_short_sat(Y >> INTER_BITS);
        short sx_clamp = clamp(sx, 0, src_cols - 1);
        short sx_p1_clamp = clamp(sx + 1, 0, src_cols - 1);
        short sy_clamp = clamp(sy, 0, src_rows - 1);
        short sy_p1_clamp = clamp(sy + 1, 0, src_rows - 1);
        int v0 = convert_int(src[mad24(sy_clamp, src_row_stride, src_offset + sx_clamp*src_px_stride)]);
        int v1 = convert_int(src[mad24(sy_clamp, src_row_stride, src_offset + sx_p1_clamp*src_px_stride)]);
        int v2 = convert_int(src[mad24(sy_p1_clamp, src_row_stride, src_offset + sx_clamp*src_px_stride)]);
        int v3 = convert_int(src[mad24(sy_p1_clamp, src_row_stride, src_offset + sx_p1_clamp*src_px_stride)]);
        short ay = (short)(Y & (INTER_TAB_SIZE - 1));
        short ax = (short)(X & (INTER_TAB_SIZE - 1));
        float taby = 1.f/INTER_TAB_SIZE*ay;
        float tabx = 1.f/INTER_TAB_SIZE*ax;
        int dst_index = mad24(dy, dst_row_stride, dst_offset + dx);
        int itab0 = convert_short_sat_rte( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );
        int itab1 = convert_short_sat_rte( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE );
        int itab2 = convert_short_sat_rte( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );
        int itab3 = convert_short_sat_rte( taby*tabx * INTER_REMAP_COEF_SCALE );
        int val = v0 * itab0 +  v1 * itab1 + v2 * itab2 + v3 * itab3;
        uchar pix = convert_uchar_sat((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS);
        dst[dst_index] = pix;
    }
 }
@@ -0,0 +1,25 @@
 #pragma once
 #define CL_USE_DEPRECATED_OPENCL_1_2_APIS
 #ifdef __APPLE__
 #include <OpenCL/cl.h>
 #else
 #include <CL/cl.h>
 #endif
 #include "common/mat.h"
 typedef struct {
  cl_kernel krnl;
  cl_mem m_y_cl, m_uv_cl;
 } Transform;
 void transform_init(Transform* s, cl_context ctx, cl_device_id device_id);
 void transform_destroy(Transform* transform);
 void transform_queue(Transform* s, cl_command_queue q,
                     cl_mem yuv, int in_width, int in_height, int in_stride, int in_uv_offset,
                     cl_mem out_y, cl_mem out_u, cl_mem out_v,
                     int out_width, int out_height,
                     const mat3& projection);
@@ -21,6 +21,7 @@ async def verify_file(file_path: str, expected_hash: str) -> bool:
  return sha256_hash.hexdigest().lower() == expected_hash.lower()
 def get_active_bundle(params: Params) -> custom.ModelManagerSP.ModelBundle:
  """Gets the active model bundle from cache"""
  if params is None:
@@ -30,3 +31,23 @@ def get_active_bundle(params: Params) -> custom.ModelManagerSP.ModelBundle:
    return messaging.log_from_bytes(active_bundle, custom.ModelManagerSP.ModelBundle)
  return None
 def get_model_runner_by_filename(filename: str) -> custom.ModelManagerSP.Runner:
  if filename.endswith(".thneed"):
    return custom.ModelManagerSP.Runner.snpe
  if filename.endswith("_tinygrad.pkl"):
    return custom.ModelManagerSP.Runner.tinygrad
 def get_active_model_runner(params: Params) -> custom.ModelManagerSP.Runner:
  """Gets the model runner from the active model bundle. If no active bundle, returns tinygrad"""
  if params is None:
    params = Params()
  if active_bundle := get_active_bundle(params):
    drive_model = next(model for model in active_bundle.models if model.type == custom.ModelManagerSP.Type.drive)
    return get_model_runner_by_filename(drive_model.fileName)
  return custom.ModelManagerSP.Runner.tinygrad
@@ -49,9 +49,11 @@ class ModelManagerSP:
  async def _download_file(self, url: str, path: str, model) -> None:
    """Downloads a file with progress tracking"""
    self._download_start_times[model.fileName] = time.monotonic()
    cloudlog.debug(f"Downloading {url} to {path}")
    async with aiohttp.ClientSession() as session:
      async with session.get(url) as response:
        cloudlog.debug(f"Response status: {response.status}")
        response.raise_for_status()
        total_size = int(response.headers.get("content-length", 0))
        bytes_downloaded = 0
@@ -125,12 +127,15 @@ class ModelManagerSP:
    """Downloads all models in a bundle"""
    self.selected_bundle = model_bundle
    self.selected_bundle.status = custom.ModelManagerSP.DownloadStatus.downloading
    cloudlog.debug(f"Downloading bundle {model_bundle.displayName} to {destination_path}")
    os.makedirs(destination_path, exist_ok=True)
    try:
      cloudlog.debug(f"Downloading {len(self.selected_bundle.models)} models")
      tasks = [self._process_model(model, destination_path)
               for model in self.selected_bundle.models]
      await asyncio.gather(*tasks)
      cloudlog.debug(f"Downloaded {len(self.selected_bundle.models)} models")
      self.selected_bundle.status = custom.ModelManagerSP.DownloadStatus.downloaded
      self.active_bundle = self.selected_bundle
      self.params.put("ModelManager_ActiveBundle", self.selected_bundle.to_bytes())
@@ -155,7 +160,9 @@ class ModelManagerSP:
        self.available_models = self.model_fetcher.get_available_models()
        if index_to_download := self.params.get("ModelManager_DownloadIndex", block=False, encoding="utf-8"):
          cloudlog.debug(f"Downloading model with index {index_to_download}")
          if model_to_download := next((model for model in self.available_models if model.index == int(index_to_download)), None):
            cloudlog.debug(f"Downloading model {model_to_download.displayName}")
            try:
              self.download(model_to_download, Paths.model_root())
            except Exception as e:
@@ -1,10 +1,12 @@
 import os
 import operator
-from cereal import car
+from cereal import car, custom
 from openpilot.common.params import Params
 from openpilot.system.hardware import PC, TICI
 from openpilot.system.manager.process import PythonProcess, NativeProcess, DaemonProcess
 from sunnypilot.models.helpers import get_active_model_runner
 from sunnypilot.sunnylink.utils import sunnylink_need_register, sunnylink_ready, use_sunnylink_uploader
 WEBCAM = os.getenv("USE_WEBCAM") is not None
@@ -70,6 +72,15 @@ def use_sunnylink_uploader_shim(started, params, CP: car.CarParams) -> bool:
  """Shim for use_sunnylink_uploader to match the process manager signature."""
  return use_sunnylink_uploader(params)
 def is_snpe_model(started, params, CP: car.CarParams) -> bool:
  """Check if the active model runner is SNPE."""
  # TODO-SP: I want to do a little more optimization here to only check this once when we've transitioned from offroad to onroad.
  return bool(get_active_model_runner(params) == custom.ModelManagerSP.Runner.snpe)
 def is_stock_model(started, params, CP: car.CarParams) -> bool:
  """Check if the active model runner is stock."""
  return not is_snpe_model(started, params, CP)
 def or_(*fns):
  return lambda *args: operator.or_(*(fn(*args) for fn in fns))
@@ -92,7 +103,7 @@ procs = [
  NativeProcess("stream_encoderd", "system/loggerd", ["./encoderd", "--stream"], notcar),
  NativeProcess("loggerd", "system/loggerd", ["./loggerd"], logging),
  # TODO Make python process once TG allows opening QCOM from child proc
-  NativeProcess("modeld", "selfdrive/modeld", ["./modeld"], only_onroad),
+  NativeProcess("modeld", "selfdrive/modeld", ["./modeld"], and_(only_onroad, is_stock_model)),
  NativeProcess("sensord", "system/sensord", ["./sensord"], only_onroad, enabled=not PC),
  NativeProcess("ui", "selfdrive/ui", ["./ui"], always_run, watchdog_max_dt=(5 if not PC else None)),
  PythonProcess("soundd", "selfdrive.ui.soundd", only_onroad),
@@ -134,6 +145,7 @@ procs = [
 # sunnypilot
 procs += [
  PythonProcess("models_manager", "sunnypilot.models.manager", only_offroad),
  NativeProcess("modeld_snpe", "sunnypilot/modeld", ["./modeld"], and_(only_onroad, is_snpe_model)),
 ]
 if os.path.exists("./github_runner.sh"):
Author	SHA1	Message	Date
DevTekVE	8120930372	Merge branch 'refs/heads/model-manager-improvements' into model-selector-multi-runner-debug-model-download	2025-01-05 22:55:45 +01:00
DevTekVE	adc9f2cde8	Enable model label button only when conditions are met Previously, the button's state update was misplaced, leading to potential issues with its interactive availability. The logic has been adjusted to ensure it is properly enabled or disabled based on onroad status and download progress. This change improves UX consistency and prevents unintended actions.	2025-01-05 22:54:29 +01:00
DevTekVE	0c89a58e91	Using is_onroad softwarePanel	2025-01-05 22:50:05 +01:00
DevTekVE	c467f8ea08	Merge branch 'refs/heads/model-manager-improvements' into model-selector-multi-runner-debug-model-download	2025-01-05 22:28:53 +01:00
DevTekVE	aa9f830bf4	Update model manager logic and handle offroad transitions Added is_onroad state tracking in SoftwarePanelSP to handle offroad transitions. Updated model manager conditions for improved bundle validation. Removed unnecessary clear operation for ModelManager_DownloadIndex during offroad transitions to optimize behavior.	2025-01-05 22:28:26 +01:00
DevTekVE	30ed809a60	fix prints	2025-01-05 21:24:42 +01:00
DevTekVE	df7d1ef256	Add detailed debug logging to model download process Enhanced logging provides better traceability during the download process. New debug logs include information such as URLs, file paths, response statuses, and model details. This facilitates easier debugging and monitoring of the model download workflow.	2025-01-05 20:12:06 +01:00
Jason Wen	491373bbd6	need this	2025-01-05 10:09:31 -05:00
Jason Wen	0f1c952a3e	must add this back	2025-01-05 10:09:00 -05:00
Jason Wen	6f57822ba1	bring onnx back for sim	2025-01-05 09:46:29 -05:00
Jason Wen	2e7ab9ce85	add to lfs	2025-01-05 09:41:34 -05:00
Jason Wen	0d47532773	need it for default snpe model	2025-01-05 09:07:31 -05:00
Jason Wen	acbcdded4f	don't even compile anymore	2025-01-05 08:45:43 -05:00
Jason Wen	bd3b4dd2e7	Reapply "remove our own" This reverts commit `b1996377b3`.	2025-01-05 08:37:46 -05:00
Jason Wen	a2e30cc7d1	Revert "try using compile2.py again" This reverts commit `914117d2e1`.	2025-01-05 08:37:34 -05:00
Jason Wen	b52347e7b4	Revert "add back symlink" This reverts commit `9f71ad0b8a`.	2025-01-05 08:37:33 -05:00
Jason Wen	36576ad5ad	Revert "fix path" This reverts commit `75d338f2bd`.	2025-01-05 08:37:32 -05:00
Jason Wen	5afa0174c5	Revert "more fix" This reverts commit `23dd423e78`.	2025-01-05 08:37:32 -05:00
Jason Wen	74126eaef8	Revert "wrong path again" This reverts commit `f5301c19d5`.	2025-01-05 08:37:31 -05:00
Jason Wen	b78f14bff3	Reapply "wrong path again" This reverts commit `309639aeb3`.	2025-01-05 08:37:30 -05:00
Jason Wen	c409ac546a	Revert "update" This reverts commit `fb313bd7fb`.	2025-01-05 08:37:30 -05:00
Jason Wen	42af2fbbc2	Revert "hardcode path to our submodule" This reverts commit `5ee1950b6f`.	2025-01-05 08:37:29 -05:00
Jason Wen	8642689c6d	Revert "force path" This reverts commit `5c3b408937`.	2025-01-05 08:37:28 -05:00
Jason Wen	8e6fb8547a	Revert "try this" This reverts commit `41fef87680`.	2025-01-05 08:37:28 -05:00
Jason Wen	0dbb46aa12	Revert "fix file name" This reverts commit `485eef68da`.	2025-01-05 08:37:27 -05:00
Jason Wen	b930a83b8d	Revert "try this" This reverts commit `767f78bbcf`.	2025-01-05 08:37:27 -05:00
Jason Wen	878cec45ad	Revert "again" This reverts commit `17c8cd7376`.	2025-01-05 08:37:26 -05:00
Jason Wen	17c8cd7376	again	2025-01-05 08:34:14 -05:00
Jason Wen	767f78bbcf	try this	2025-01-05 08:30:28 -05:00
Jason Wen	485eef68da	fix file name	2025-01-05 08:27:23 -05:00
Jason Wen	41fef87680	try this	2025-01-05 08:26:23 -05:00
Jason Wen	5c3b408937	force path	2025-01-05 08:21:13 -05:00
Jason Wen	5ee1950b6f	hardcode path to our submodule	2025-01-05 08:12:02 -05:00
Jason Wen	fb313bd7fb	update	2025-01-05 08:09:30 -05:00
Jason Wen	309639aeb3	Revert "wrong path again" This reverts commit `f5301c19d5`.	2025-01-05 08:06:49 -05:00
Jason Wen	f5301c19d5	wrong path again	2025-01-05 08:04:49 -05:00
Jason Wen	23dd423e78	more fix	2025-01-05 07:59:18 -05:00
Jason Wen	75d338f2bd	fix path	2025-01-05 07:56:48 -05:00
Jason Wen	9f71ad0b8a	add back symlink	2025-01-05 07:55:30 -05:00
Jason Wen	914117d2e1	try using compile2.py again	2025-01-05 07:54:34 -05:00
Jason Wen	b1996377b3	Revert "remove our own" This reverts commit `1cf4f57502`.	2025-01-05 07:52:11 -05:00
Jason Wen	158a76289e	try this	2025-01-05 07:35:41 -05:00
Jason Wen	5c125f5fa4	fix thneed	2025-01-05 07:21:11 -05:00
Jason Wen	130ba6b905	use upstream compile3	2025-01-05 06:59:29 -05:00
Jason Wen	1cf4f57502	remove our own	2025-01-05 06:59:19 -05:00
Jason Wen	f9ca110410	mypy	2025-01-05 06:37:08 -05:00
Jason Wen	4bdecdec11	fix thneed paths	2025-01-05 06:32:58 -05:00
Jason Wen	4b6c94e794	Merge branch 'master-new' into model-selector-multi-runner	2025-01-05 06:31:54 -05:00
Jason Wen	59c551ac77	ruff	2025-01-05 06:28:46 -05:00
Jason Wen	c54cc074e2	fix process name	2025-01-05 06:25:18 -05:00
Jason Wen	07391c72b4	ignore tg	2025-01-05 06:20:49 -05:00
DevTekVE	e46aaf0263	Refactor modeld process function checks. Introduce `is_stock_model` to clarify logic and replace direct uses of `is_snpe_model` where the stock model condition is needed. Additionally, rename the duplicate "modeld" process in sunnyPilot to "modeld_snpe" for clarity and consistency.	2025-01-05 12:16:24 +01:00
DevTekVE	f3db1254c3	Adjust modeld execution logic based on active model runner Introduced a check to conditionally execute `modeld` based on the active model runner. Added support for distinguishing between SNPE and TinyGrad runners using new helper functions and updated `custom.capnp` definitions. This change optimizes process management by ensuring compatibility with the selected model runner.	2025-01-05 11:56:31 +01:00
Jason Wen	2c3d776a52	fix more paths	2025-01-05 05:49:31 -05:00
Jason Wen	8516026c74	fix path	2025-01-05 05:35:48 -05:00
Jason Wen	b916e9c655	force with snpe to validate	2025-01-05 05:30:37 -05:00
Jason Wen	15d127889b	tinygrad with snpe	2025-01-05 05:21:25 -05:00