Cleaning up

Bringing what was on master back then
old desired_curv data: MLSIM V1 to Postal Service
2026-07-28 00:32:06 +08:00 · 2025-01-06 22:40:03 +01:00 · 2025-01-06 22:29:55 +01:00 · 2025-01-06 15:35:47 -05:00 · 2025-01-06 15:34:37 -05:00 · 2025-01-06 13:54:32 -05:00
19 changed files with 137 additions and 358 deletions
@@ -15,8 +15,7 @@ class ModelConstants:
  # model inputs constants
  MODEL_FREQ = 20
  FEATURE_LEN = 512
-  FULL_HISTORY_BUFFER_LEN = 99
-  HISTORY_BUFFER_LEN = 24
+  HISTORY_BUFFER_LEN = 99
  DESIRE_LEN = 8
  TRAFFIC_CONVENTION_LEN = 2
  LAT_PLANNER_STATE_LEN = 4
@@ -3,22 +3,11 @@ import capnp
 import numpy as np
 from cereal import log
 from openpilot.sunnypilot.modeld.constants import ModelConstants, Plan, Meta
-from openpilot.selfdrive.controls.lib.drive_helpers import MIN_SPEED

 SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')

 ConfidenceClass = log.ModelDataV2.ConfidenceClass

-def curv_from_psis(psi_target, psi_rate, vego, delay):
-  vego = np.clip(vego, MIN_SPEED, np.inf)
-  curv_from_psi = psi_target / (vego * delay)  # epsilon to prevent divide-by-zero
-  return 2*curv_from_psi - psi_rate / vego
-
-def get_curvature_from_plan(plan, vego, delay):
-  psi_target = np.interp(delay, ModelConstants.T_IDXS, plan[:, Plan.T_FROM_CURRENT_EULER][:, 2])
-  psi_rate = plan[:, Plan.ORIENTATION_RATE][0, 2]
-  return curv_from_psis(psi_target, psi_rate, vego, delay)
-
 class PublishState:
  def __init__(self):
    self.disengage_buffer = np.zeros(ModelConstants.CONFIDENCE_BUFFER_LEN*ModelConstants.DISENGAGE_WIDTH, dtype=np.float32)
@@ -59,24 +48,15 @@ def fill_xyz_poly(builder, degree, x, y, z):
  builder.yCoefficients = coeffs[:, 1].tolist()
  builder.zCoefficients = coeffs[:, 2].tolist()

-def fill_lane_line_meta(builder, lane_lines, lane_line_probs):
-  builder.leftY = lane_lines[1].y[0]
-  builder.leftProb = lane_line_probs[1]
-  builder.rightY = lane_lines[2].y[0]
-  builder.rightProb = lane_line_probs[2]
-
 def fill_model_msg(base_msg: capnp._DynamicStructBuilder, extended_msg: capnp._DynamicStructBuilder,
-                   net_output_data: dict[str, np.ndarray], v_ego: float, delay: float,
-                   publish_state: PublishState, vipc_frame_id: int, vipc_frame_id_extra: int,
-                   frame_id: int, frame_drop: float, timestamp_eof: int, model_execution_time: float,
-                   valid: bool) -> None:
+                   net_output_data: dict[str, np.ndarray], publish_state: PublishState,
+                   vipc_frame_id: int, vipc_frame_id_extra: int, frame_id: int, frame_drop: float,
+                   timestamp_eof: int, model_execution_time: float, valid: bool) -> None:
  frame_age = frame_id - vipc_frame_id if frame_id > vipc_frame_id else 0
  frame_drop_perc = frame_drop * 100
  extended_msg.valid = valid
  base_msg.valid = valid

-  desired_curv = float(get_curvature_from_plan(net_output_data['plan'][0], v_ego, delay))
-
  driving_model_data = base_msg.drivingModelData

  driving_model_data.frameId = vipc_frame_id
@@ -85,7 +65,7 @@ def fill_model_msg(base_msg: capnp._DynamicStructBuilder, extended_msg: capnp._D
  driving_model_data.modelExecutionTime = model_execution_time

  action = driving_model_data.action
-  action.desiredCurvature = desired_curv
+  action.desiredCurvature = float(net_output_data['desired_curvature'][0,0])

  modelV2 = extended_msg.modelV2
  modelV2.frameId = vipc_frame_id
@@ -120,7 +100,7 @@ def fill_model_msg(base_msg: capnp._DynamicStructBuilder, extended_msg: capnp._D

  # lateral planning
  action = modelV2.action
-  action.desiredCurvature = desired_curv
+  action.desiredCurvature = float(net_output_data['desired_curvature'][0,0])

  # times at X_IDXS according to model plan
  PLAN_T_IDXS = [np.nan] * ModelConstants.IDX_N
@@ -150,7 +130,10 @@ def fill_model_msg(base_msg: capnp._DynamicStructBuilder, extended_msg: capnp._D
  modelV2.laneLineProbs = net_output_data['lane_lines_prob'][0,1::2].tolist()

  lane_line_meta = driving_model_data.laneLineMeta
-  fill_lane_line_meta(lane_line_meta, modelV2.laneLines, modelV2.laneLineProbs)
+  lane_line_meta.leftY = modelV2.laneLines[1].y[0]
+  lane_line_meta.leftProb = modelV2.laneLineProbs[1]
+  lane_line_meta.rightY = modelV2.laneLines[2].y[0]
+  lane_line_meta.rightProb = modelV2.laneLineProbs[2]

  # road edges
  modelV2.init('roadEdges', 2)
@@ -17,12 +17,13 @@ from openpilot.common.realtime import config_realtime_process
 from openpilot.common.transformations.camera import DEVICE_CAMERAS
 from openpilot.common.transformations.model import get_warp_matrix
 from openpilot.system import sentry
+from openpilot.selfdrive.car.card import convert_to_capnp
 from openpilot.selfdrive.controls.lib.desire_helper import DesireHelper
 from openpilot.sunnypilot.modeld.runners import ModelRunner, Runtime
 from openpilot.sunnypilot.modeld.parse_model_outputs import Parser
 from openpilot.sunnypilot.modeld.fill_model_msg import fill_model_msg, fill_pose_msg, PublishState
 from openpilot.sunnypilot.modeld.constants import ModelConstants
-from openpilot.sunnypilot.modeld.models.commonmodel_pyx import DrivingModelFrame, CLContext
+from openpilot.sunnypilot.modeld.models.commonmodel_pyx import ModelFrame, CLContext

 PROCESS_NAME = "sunnypilot.modeld.modeld"
 SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')
@@ -33,7 +34,6 @@ MODEL_PATHS = {

 METADATA_PATH = Path(__file__).parent / 'models/supercombo_metadata.pkl'

-
 class FrameMeta:
  frame_id: int = 0
  timestamp_sof: int = 0
@@ -44,24 +44,22 @@ class FrameMeta:
      self.frame_id, self.timestamp_sof, self.timestamp_eof = vipc.frame_id, vipc.timestamp_sof, vipc.timestamp_eof

 class ModelState:
-  frame: DrivingModelFrame
-  wide_frame: DrivingModelFrame
+  frame: ModelFrame
+  wide_frame: ModelFrame
  inputs: dict[str, np.ndarray]
  output: np.ndarray
  prev_desire: np.ndarray  # for tracking the rising edge of the pulse
  model: ModelRunner

  def __init__(self, context: CLContext):
-    self.frame = DrivingModelFrame(context)
-    self.wide_frame = DrivingModelFrame(context)
+    self.frame = ModelFrame(context)
+    self.wide_frame = ModelFrame(context)
    self.prev_desire = np.zeros(ModelConstants.DESIRE_LEN, dtype=np.float32)
-    self.full_features_20Hz = np.zeros((ModelConstants.FULL_HISTORY_BUFFER_LEN, ModelConstants.FEATURE_LEN), dtype=np.float32)
-    self.desire_20Hz =  np.zeros((ModelConstants.FULL_HISTORY_BUFFER_LEN + 1, ModelConstants.DESIRE_LEN), dtype=np.float32)
-
-    # img buffers are managed in openCL transform code
    self.inputs = {
      'desire': np.zeros(ModelConstants.DESIRE_LEN * (ModelConstants.HISTORY_BUFFER_LEN+1), dtype=np.float32),
      'traffic_convention': np.zeros(ModelConstants.TRAFFIC_CONVENTION_LEN, dtype=np.float32),
+      'lateral_control_params': np.zeros(ModelConstants.LATERAL_CONTROL_PARAMS_LEN, dtype=np.float32),
+      'prev_desired_curv': np.zeros(ModelConstants.PREV_DESIRED_CURV_LEN * (ModelConstants.HISTORY_BUFFER_LEN+1), dtype=np.float32),
      'features_buffer': np.zeros(ModelConstants.HISTORY_BUFFER_LEN * ModelConstants.FEATURE_LEN, dtype=np.float32),
    }

@@ -89,17 +87,17 @@ class ModelState:
                inputs: dict[str, np.ndarray], prepare_only: bool) -> dict[str, np.ndarray] | None:
    # Model decides when action is completed, so desire input is just a pulse triggered on rising edge
    inputs['desire'][0] = 0
-    new_desire = np.where(inputs['desire'] - self.prev_desire > .99, inputs['desire'], 0)
+    self.inputs['desire'][:-ModelConstants.DESIRE_LEN] = self.inputs['desire'][ModelConstants.DESIRE_LEN:]
+    self.inputs['desire'][-ModelConstants.DESIRE_LEN:] = np.where(inputs['desire'] - self.prev_desire > .99, inputs['desire'], 0)
    self.prev_desire[:] = inputs['desire']

-    self.desire_20Hz[:-1] = self.desire_20Hz[1:]
-    self.desire_20Hz[-1] = new_desire
-    self.inputs['desire'][:] = self.desire_20Hz.reshape((25,4,-1)).max(axis=1).flatten()
-
    self.inputs['traffic_convention'][:] = inputs['traffic_convention']
+    self.inputs['lateral_control_params'][:] = inputs['lateral_control_params']

+    # if getCLBuffer is not None, frame will be None
    self.model.setInputBuffer("input_imgs", self.frame.prepare(buf, transform.flatten(), self.model.getCLBuffer("input_imgs")))
-    self.model.setInputBuffer("big_input_imgs", self.wide_frame.prepare(wbuf, transform_wide.flatten(), self.model.getCLBuffer("big_input_imgs")))
+    if wbuf is not None:
+      self.model.setInputBuffer("big_input_imgs", self.wide_frame.prepare(wbuf, transform_wide.flatten(), self.model.getCLBuffer("big_input_imgs")))

    if prepare_only:
      return None
@@ -107,11 +105,10 @@ class ModelState:
    self.model.execute()
    outputs = self.parser.parse_outputs(self.slice_outputs(self.output))

-    self.full_features_20Hz[:-1] = self.full_features_20Hz[1:]
-    self.full_features_20Hz[-1] = outputs['hidden_state'][0, :]
-
-    idxs = np.arange(-4,-100,-4)[::-1]
-    self.inputs['features_buffer'][:] = self.full_features_20Hz[idxs].flatten()
+    self.inputs['features_buffer'][:-ModelConstants.FEATURE_LEN] = self.inputs['features_buffer'][ModelConstants.FEATURE_LEN:]
+    self.inputs['features_buffer'][-ModelConstants.FEATURE_LEN:] = outputs['hidden_state'][0, :]
+    self.inputs['prev_desired_curv'][:-ModelConstants.PREV_DESIRED_CURV_LEN] = self.inputs['prev_desired_curv'][ModelConstants.PREV_DESIRED_CURV_LEN:]
+    self.inputs['prev_desired_curv'][-ModelConstants.PREV_DESIRED_CURV_LEN:] = outputs['desired_curvature'][0, :]
    return outputs


@@ -174,9 +171,10 @@ def main(demo=False):


  if demo:
-    CP = get_demo_car_params()
+    CP = convert_to_capnp(get_demo_car_params())
  else:
    CP = messaging.log_from_bytes(params.get("CarParams", block=True), car.CarParams)
+
  cloudlog.info("modeld got CarParams: %s", CP.carName)

  # TODO this needs more thought, use .2s extra for now to estimate other delays
@@ -221,7 +219,7 @@ def main(demo=False):
    desire = DH.desire
    is_rhd = sm["driverMonitoringState"].isRHD
    frame_id = sm["roadCameraState"].frameId
-    v_ego = max(sm["carState"].vEgo, 0.)
+    lateral_control_params = np.array([sm["carState"].vEgo, steer_delay], dtype=np.float32)
    if sm.updated["liveCalibration"] and sm.seen['roadCameraState'] and sm.seen['deviceState']:
      device_from_calib_euler = np.array(sm["liveCalibration"].rpyCalib, dtype=np.float32)
      dc = DEVICE_CAMERAS[(str(sm['deviceState'].deviceType), str(sm['roadCameraState'].sensor))]
@@ -252,6 +250,7 @@ def main(demo=False):
    inputs:dict[str, np.ndarray] = {
      'desire': vec_desire,
      'traffic_convention': traffic_convention,
+      'lateral_control_params': lateral_control_params,
      }

    mt1 = time.perf_counter()
@@ -263,8 +262,7 @@ def main(demo=False):
      modelv2_send = messaging.new_message('modelV2')
      drivingdata_send = messaging.new_message('drivingModelData')
      posenet_send = messaging.new_message('cameraOdometry')
-      fill_model_msg(drivingdata_send, modelv2_send, model_output, v_ego, steer_delay,
-                     publish_state, meta_main.frame_id, meta_extra.frame_id, frame_id,
+      fill_model_msg(drivingdata_send, modelv2_send, model_output, publish_state, meta_main.frame_id, meta_extra.frame_id, frame_id,
                     frame_drop_ratio, meta_main.timestamp_eof, model_execution_time, live_calib_seen)

      desire_state = modelv2_send.modelV2.meta.desireState
@@ -1,62 +0,0 @@
-## Neural networks in openpilot
-To view the architecture of the ONNX networks, you can use [netron](https://netron.app/)
-
-## Supercombo
-### Supercombo input format (Full size: 799906 x float32)
-* **image stream**
-  * Two consecutive images (256 * 512 * 3 in RGB) recorded at 20 Hz : 393216 = 2 * 6 * 128 * 256
-    * Each 256 * 512 image is represented in YUV420 with 6 channels : 6 * 128 * 256
-      * Channels 0,1,2,3 represent the full-res Y channel and are represented in numpy as Y[::2, ::2], Y[::2, 1::2], Y[1::2, ::2], and Y[1::2, 1::2]
-      * Channel 4 represents the half-res U channel
-      * Channel 5 represents the half-res V channel
-* **wide image stream**
-  * Two consecutive images (256 * 512 * 3 in RGB) recorded at 20 Hz : 393216 = 2 * 6 * 128 * 256
-    * Each 256 * 512 image is represented in YUV420 with 6 channels : 6 * 128 * 256
-      * Channels 0,1,2,3 represent the full-res Y channel and are represented in numpy as Y[::2, ::2], Y[::2, 1::2], Y[1::2, ::2], and Y[1::2, 1::2]
-      * Channel 4 represents the half-res U channel
-      * Channel 5 represents the half-res V channel
-* **desire**
-  * one-hot encoded buffer to command model to execute certain actions, bit needs to be sent for the past 5 seconds (at 20FPS) : 100 * 8
-* **traffic convention**
-  * one-hot encoded vector to tell model whether traffic is right-hand or left-hand traffic : 2
-* **feature buffer**
-  * A buffer of intermediate features that gets appended to the current feature to form a 5 seconds temporal context (at 20FPS) : 99 * 512
-
-
-### Supercombo output format (Full size: XXX x float32)
-Read [here](https://github.com/commaai/openpilot/blob/90af436a121164a51da9fa48d093c29f738adf6a/selfdrive/modeld/models/driving.h#L236) for more.
-
-
-## Driver Monitoring Model
-* .onnx model can be run with onnx runtimes
-* .dlc file is a pre-quantized model and only runs on qualcomm DSPs
-
-### input format
-* single image W = 1440 H = 960 luminance channel (Y) from the planar YUV420 format:
-  * full input size is 1440 * 960 = 1382400
-  * normalized ranging from 0.0 to 1.0 in float32 (onnx runner) or ranging from 0 to 255 in uint8 (snpe runner)
-* camera calibration angles (roll, pitch, yaw) from liveCalibration: 3 x float32 inputs
-
-### output format
-* 84 x float32 outputs = 2 + 41 * 2 ([parsing example](https://github.com/commaai/openpilot/blob/22ce4e17ba0d3bfcf37f8255a4dd1dc683fe0c38/selfdrive/modeld/models/dmonitoring.cc#L33))
-  * for each person in the front seats (2 * 41)
-    * face pose: 12 = 6 + 6
-      * face orientation [pitch, yaw, roll] in camera frame: 3
-      * face position [dx, dy] relative to image center: 2
-      * normalized face size: 1
-      * standard deviations for above outputs: 6
-    * face visible probability: 1
-    * eyes: 20 = (8 + 1) + (8 + 1) + 1 + 1
-      * eye position and size, and their standard deviations: 8
-      * eye visible probability: 1
-      * eye closed probability: 1
-    * wearing sunglasses probability: 1
-    * face occluded probability: 1
-    * touching wheel probability: 1
-    * paying attention probability: 1
-    * (deprecated) distracted probabilities: 2
-    * using phone probability: 1
-    * distracted probability: 1
-  * common outputs 2
-    * poor camera vision probability: 1
-    * left hand drive probability: 1
@@ -1,69 +1,50 @@
 #include "sunnypilot/modeld/models/commonmodel.h"

+#include <cassert>
 #include <cmath>
 #include <cstring>

 #include "common/clutil.h"

-DrivingModelFrame::DrivingModelFrame(cl_device_id device_id, cl_context context) : ModelFrame(device_id, context) {
-  input_frames = std::make_unique<uint8_t[]>(buf_size);
-  //input_frames_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err));
-  img_buffer_20hz_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, 5*frame_size_bytes, NULL, &err));
-  region.origin = 4 * frame_size_bytes;
-  region.size = frame_size_bytes;
-  last_img_cl = CL_CHECK_ERR(clCreateSubBuffer(img_buffer_20hz_cl, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err));
+ModelFrame::ModelFrame(cl_device_id device_id, cl_context context) {
+  input_frames = std::make_unique<float[]>(buf_size);

+  q = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, 0, &err));
+  y_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, MODEL_WIDTH * MODEL_HEIGHT, NULL, &err));
+  u_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (MODEL_WIDTH / 2) * (MODEL_HEIGHT / 2), NULL, &err));
+  v_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (MODEL_WIDTH / 2) * (MODEL_HEIGHT / 2), NULL, &err));
+  net_input_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, MODEL_FRAME_SIZE * sizeof(float), NULL, &err));
+
+  transform_init(&transform, context, device_id);
  loadyuv_init(&loadyuv, context, device_id, MODEL_WIDTH, MODEL_HEIGHT);
-  init_transform(device_id, context, MODEL_WIDTH, MODEL_HEIGHT);
 }

-uint8_t* DrivingModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection, cl_mem* output) {
-  run_transform(yuv_cl, MODEL_WIDTH, MODEL_HEIGHT, frame_width, frame_height, frame_stride, frame_uv_offset, projection);
-
-  for (int i = 0; i < 4; i++) {
-    CL_CHECK(clEnqueueCopyBuffer(q, img_buffer_20hz_cl, img_buffer_20hz_cl, (i+1)*frame_size_bytes, i*frame_size_bytes, frame_size_bytes, 0, nullptr, nullptr));
-  }
-  loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, last_img_cl);
+float* ModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3 &projection, cl_mem *output) {
+  transform_queue(&this->transform, q,
+                  yuv_cl, frame_width, frame_height, frame_stride, frame_uv_offset,
+                  y_cl, u_cl, v_cl, MODEL_WIDTH, MODEL_HEIGHT, projection);

  if (output == NULL) {
-    CL_CHECK(clEnqueueReadBuffer(q, img_buffer_20hz_cl, CL_TRUE, 0, frame_size_bytes, &input_frames[0], 0, nullptr, nullptr));
-    CL_CHECK(clEnqueueReadBuffer(q, last_img_cl, CL_TRUE, 0, frame_size_bytes, &input_frames[MODEL_FRAME_SIZE], 0, nullptr, nullptr));
+    loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, net_input_cl);
+
+    std::memmove(&input_frames[0], &input_frames[MODEL_FRAME_SIZE], sizeof(float) * MODEL_FRAME_SIZE);
+    CL_CHECK(clEnqueueReadBuffer(q, net_input_cl, CL_TRUE, 0, MODEL_FRAME_SIZE * sizeof(float), &input_frames[MODEL_FRAME_SIZE], 0, nullptr, nullptr));
    clFinish(q);
    return &input_frames[0];
  } else {
-    copy_queue(&loadyuv, q, img_buffer_20hz_cl, *output, 0, 0, frame_size_bytes);
-    copy_queue(&loadyuv, q, last_img_cl, *output, 0, frame_size_bytes, frame_size_bytes);
-
+    loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, *output, true);
    // NOTE: Since thneed is using a different command queue, this clFinish is needed to ensure the image is ready.
    clFinish(q);
    return NULL;
  }
 }

-DrivingModelFrame::~DrivingModelFrame() {
-  deinit_transform();
+ModelFrame::~ModelFrame() {
+  transform_destroy(&transform);
  loadyuv_destroy(&loadyuv);
-  CL_CHECK(clReleaseMemObject(img_buffer_20hz_cl));
-  CL_CHECK(clReleaseMemObject(last_img_cl));
+  CL_CHECK(clReleaseMemObject(net_input_cl));
+  CL_CHECK(clReleaseMemObject(v_cl));
+  CL_CHECK(clReleaseMemObject(u_cl));
+  CL_CHECK(clReleaseMemObject(y_cl));
  CL_CHECK(clReleaseCommandQueue(q));
-}
-
-
-MonitoringModelFrame::MonitoringModelFrame(cl_device_id device_id, cl_context context) : ModelFrame(device_id, context) {
-  input_frames = std::make_unique<uint8_t[]>(buf_size);
-  //input_frame_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err));
-
-  init_transform(device_id, context, MODEL_WIDTH, MODEL_HEIGHT);
-}
-uint8_t* MonitoringModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection, cl_mem* output) {
-  run_transform(yuv_cl, MODEL_WIDTH, MODEL_HEIGHT, frame_width, frame_height, frame_stride, frame_uv_offset, projection);
-  CL_CHECK(clEnqueueReadBuffer(q, y_cl, CL_TRUE, 0, MODEL_FRAME_SIZE * sizeof(uint8_t), input_frames.get(), 0, nullptr, nullptr));
-  clFinish(q);
-  //return &y_cl;
-  return input_frames.get();
-}
-
-MonitoringModelFrame::~MonitoringModelFrame() {
-  deinit_transform();
-  CL_CHECK(clReleaseCommandQueue(q));
-}
+}
@@ -2,7 +2,6 @@

 #include <cfloat>
 #include <cstdlib>
-#include <cassert>

 #include <memory>

@@ -14,85 +13,24 @@
 #endif

 #include "common/mat.h"
-#include "selfdrive/modeld/transforms/loadyuv.h"
-#include "selfdrive/modeld/transforms/transform.h"
+#include "sunnypilot/modeld/transforms/loadyuv.h"
+#include "sunnypilot/modeld/transforms/transform.h"

 class ModelFrame {
 public:
-  ModelFrame(cl_device_id device_id, cl_context context) {
-    q = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, 0, &err));
-  }
-  virtual ~ModelFrame() {}
-  virtual uint8_t* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection, cl_mem* output) { return NULL; }
-  /*
-  uint8_t* buffer_from_cl(cl_mem *in_frames, int buffer_size) {
-    CL_CHECK(clEnqueueReadBuffer(q, *in_frames, CL_TRUE, 0, buffer_size, input_frames.get(), 0, nullptr, nullptr));
-    clFinish(q);
-    return &input_frames[0];
-  }
-  */
-
-  int MODEL_WIDTH;
-  int MODEL_HEIGHT;
-  int MODEL_FRAME_SIZE;
-  int buf_size;
-
-protected:
-  cl_mem y_cl, u_cl, v_cl;
-  Transform transform;
-  cl_command_queue q;
-  std::unique_ptr<uint8_t[]> input_frames;
-
-  void init_transform(cl_device_id device_id, cl_context context, int model_width, int model_height) {
-    y_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, model_width * model_height, NULL, &err));
-    u_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (model_width / 2) * (model_height / 2), NULL, &err));
-    v_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (model_width / 2) * (model_height / 2), NULL, &err));
-    transform_init(&transform, context, device_id);
-  }
-
-  void deinit_transform() {
-    transform_destroy(&transform);
-    CL_CHECK(clReleaseMemObject(v_cl));
-    CL_CHECK(clReleaseMemObject(u_cl));
-    CL_CHECK(clReleaseMemObject(y_cl));
-  }
-
-  void run_transform(cl_mem yuv_cl, int model_width, int model_height, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection) {
-    transform_queue(&transform, q,
-        yuv_cl, frame_width, frame_height, frame_stride, frame_uv_offset,
-        y_cl, u_cl, v_cl, model_width, model_height, projection);
-  }
-};
-
-class DrivingModelFrame : public ModelFrame {
-public:
-  DrivingModelFrame(cl_device_id device_id, cl_context context);
-  ~DrivingModelFrame();
-  uint8_t* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection, cl_mem* output);
+  ModelFrame(cl_device_id device_id, cl_context context);
+  ~ModelFrame();
+  float* prepare(cl_mem yuv_cl, int width, int height, int frame_stride, int frame_uv_offset, const mat3& transform, cl_mem *output);

  const int MODEL_WIDTH = 512;
  const int MODEL_HEIGHT = 256;
  const int MODEL_FRAME_SIZE = MODEL_WIDTH * MODEL_HEIGHT * 3 / 2;
  const int buf_size = MODEL_FRAME_SIZE * 2;
-  const size_t frame_size_bytes = MODEL_FRAME_SIZE * sizeof(uint8_t);

 private:
+  Transform transform;
  LoadYUVState loadyuv;
-  cl_mem img_buffer_20hz_cl, last_img_cl;//, input_frames_cl;
-  cl_buffer_region region;
-};
-
-class MonitoringModelFrame : public ModelFrame {
-public:
-  MonitoringModelFrame(cl_device_id device_id, cl_context context);
-  ~MonitoringModelFrame();
-  uint8_t* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection, cl_mem* output);
-
-  const int MODEL_WIDTH = 1440;
-  const int MODEL_HEIGHT = 960;
-  const int MODEL_FRAME_SIZE = MODEL_WIDTH * MODEL_HEIGHT;
-  const int buf_size = MODEL_FRAME_SIZE;
-
-private:
-  // cl_mem input_frame_cl;
+  cl_command_queue q;
+  cl_mem y_cl, u_cl, v_cl, net_input_cl;
+  std::unique_ptr<float[]> input_frames;
 };
@@ -14,13 +14,5 @@ cdef extern from "common/clutil.h":
 cdef extern from "sunnypilot/modeld/models/commonmodel.h":
  cppclass ModelFrame:
    int buf_size
-    # unsigned char * buffer_from_cl(cl_mem*, int);
-    unsigned char * prepare(cl_mem, int, int, int, int, mat3, cl_mem*)
-
-  cppclass DrivingModelFrame:
-    int buf_size
-    DrivingModelFrame(cl_device_id, cl_context)
-
-  cppclass MonitoringModelFrame:
-    int buf_size
-    MonitoringModelFrame(cl_device_id, cl_context)
+    ModelFrame(cl_device_id, cl_context)
+    float * prepare(cl_mem, int, int, int, int, mat3, cl_mem*)
@@ -4,12 +4,11 @@
 import numpy as np
 cimport numpy as cnp
 from libc.string cimport memcpy
-from libc.stdint cimport uintptr_t

 from msgq.visionipc.visionipc cimport cl_mem
 from msgq.visionipc.visionipc_pyx cimport VisionBuf, CLContext as BaseCLContext
-from sunnypilot.modeld.models.commonmodel cimport CL_DEVICE_TYPE_DEFAULT, cl_get_device_id, cl_create_context
-from sunnypilot.modeld.models.commonmodel cimport mat3, ModelFrame as cppModelFrame, DrivingModelFrame as cppDrivingModelFrame, MonitoringModelFrame as cppMonitoringModelFrame
+from .commonmodel cimport CL_DEVICE_TYPE_DEFAULT, cl_get_device_id, cl_create_context
+from .commonmodel cimport mat3, ModelFrame as cppModelFrame


 cdef class CLContext(BaseCLContext):
@@ -24,17 +23,11 @@ cdef class CLMem:
    mem.mem = <cl_mem*> cmem
    return mem

-  @property
-  def mem_address(self):
-    return <uintptr_t>(self.mem)
-
-def cl_from_visionbuf(VisionBuf buf):
-  return CLMem.create(<void*>&buf.buf.buf_cl)
-
-
 cdef class ModelFrame:
  cdef cppModelFrame * frame
-  cdef int buf_size
+
+  def __cinit__(self, CLContext context):
+    self.frame = new cppModelFrame(context.device_id, context.context)

  def __dealloc__(self):
    del self.frame
@@ -42,35 +35,11 @@ cdef class ModelFrame:
  def prepare(self, VisionBuf buf, float[:] projection, CLMem output):
    cdef mat3 cprojection
    memcpy(cprojection.v, &projection[0], 9*sizeof(float))
-    cdef unsigned char * data
+    cdef float * data
    if output is None:
      data = self.frame.prepare(buf.buf.buf_cl, buf.width, buf.height, buf.stride, buf.uv_offset, cprojection, NULL)
    else:
      data = self.frame.prepare(buf.buf.buf_cl, buf.width, buf.height, buf.stride, buf.uv_offset, cprojection, output.mem)
    if not data:
      return None
-
-    return np.asarray(<cnp.uint8_t[:self.buf_size]> data)
-    # return CLMem.create(data)
-
-  # def buffer_from_cl(self, CLMem in_frames):
-  #   cdef unsigned char * data2
-  #   data2 = self.frame.buffer_from_cl(in_frames.mem, self.buf_size)
-  #   return np.asarray(<cnp.uint8_t[:self.buf_size]> data2)
-
-
-cdef class DrivingModelFrame(ModelFrame):
-  cdef cppDrivingModelFrame * _frame
-
-  def __cinit__(self, CLContext context):
-    self._frame = new cppDrivingModelFrame(context.device_id, context.context)
-    self.frame = <cppModelFrame*>(self._frame)
-    self.buf_size = self._frame.buf_size
-
-cdef class MonitoringModelFrame(ModelFrame):
-  cdef cppMonitoringModelFrame * _frame
-
-  def __cinit__(self, CLContext context):
-    self._frame = new cppMonitoringModelFrame(context.device_id, context.context)
-    self.frame = <cppModelFrame*>(self._frame)
-    self.buf_size = self._frame.buf_size
+    return np.asarray(<cnp.float32_t[:self.frame.buf_size]> data)
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0c896681fd6851de3968433e12f37834429eba265e938cf383200be3e5835cec
-size 49096168
+oid sha256:fb2018c74cdd9e5cb070ec7bed7f8581fabd55e39057d0a03aaffd2e42408154
+size 62486347
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:af2cb689ec9e31292f759b561e70e4558a38f778558dff39ccff460ccafc0d52
-size 49849624
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:441f2865017c07ee0dfb2488c5d86aab00df7ff5c5ec163959f35c33d74b65e6
-size 594
@@ -96,6 +96,8 @@ class Parser:
                   out_shape=(ModelConstants.LEAD_TRAJ_LEN,ModelConstants.LEAD_WIDTH))
    if 'lat_planner_solution' in outs:
      self.parse_mdn('lat_planner_solution', outs, in_N=0, out_N=0, out_shape=(ModelConstants.IDX_N,ModelConstants.LAT_PLANNER_SOLUTION_WIDTH))
+    if 'desired_curvature' in outs:
+      self.parse_mdn('desired_curvature', outs, in_N=0, out_N=0, out_shape=(ModelConstants.DESIRED_CURV_WIDTH,))
    for k in ['lead_prob', 'lane_lines_prob', 'meta']:
      self.parse_binary_crossentropy(k, outs)
    self.parse_categorical_crossentropy('desire_state', outs, out_shape=(ModelConstants.DESIRE_PRED_WIDTH,))
@@ -1,12 +1,39 @@
-import os
 import onnx
+import itertools
+import os
 import sys
 import numpy as np
 from typing import Any

 from openpilot.sunnypilot.modeld.runners.runmodel_pyx import RunModel
-from openpilot.sunnypilot.modeld.runners.ort_helpers import convert_fp16_to_fp32, ORT_TYPES_TO_NP_TYPES

+ORT_TYPES_TO_NP_TYPES = {'tensor(float16)': np.float16, 'tensor(float)': np.float32, 'tensor(uint8)': np.uint8}
+
+def attributeproto_fp16_to_fp32(attr):
+  float32_list = np.frombuffer(attr.raw_data, dtype=np.float16)
+  attr.data_type = 1
+  attr.raw_data = float32_list.astype(np.float32).tobytes()
+
+def convert_fp16_to_fp32(onnx_path_or_bytes):
+  if isinstance(onnx_path_or_bytes, bytes):
+    model = onnx.load_from_string(onnx_path_or_bytes)
+  elif isinstance(onnx_path_or_bytes, str):
+    model = onnx.load(onnx_path_or_bytes)
+
+  for i in model.graph.initializer:
+    if i.data_type == 10:
+      attributeproto_fp16_to_fp32(i)
+  for i in itertools.chain(model.graph.input, model.graph.output):
+    if i.type.tensor_type.elem_type == 10:
+      i.type.tensor_type.elem_type = 1
+  for i in model.graph.node:
+    if i.op_type == 'Cast' and i.attribute[0].i == 10:
+      i.attribute[0].i = 1
+    for a in i.attribute:
+      if hasattr(a, 't'):
+        if a.t.data_type == 10:
+          attributeproto_fp16_to_fp32(a.t)
+  return model.SerializeToString()

 def create_ort_session(path, fp16_to_fp32):
  os.environ["OMP_NUM_THREADS"] = "4"
@@ -22,14 +49,14 @@ def create_ort_session(path, fp16_to_fp32):
    provider = 'OpenVINOExecutionProvider'
  elif 'CUDAExecutionProvider' in ort.get_available_providers() and 'ONNXCPU' not in os.environ:
    options.intra_op_num_threads = 2
-    provider = ('CUDAExecutionProvider', {'cudnn_conv_algo_search': 'EXHAUSTIVE'})
+    provider = ('CUDAExecutionProvider', {'cudnn_conv_algo_search': 'DEFAULT'})
  else:
    options.intra_op_num_threads = 2
    options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
    options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    provider = 'CPUExecutionProvider'

-  model_data = convert_fp16_to_fp32(onnx.load(path)) if fp16_to_fp32 else path
+  model_data = convert_fp16_to_fp32(path) if fp16_to_fp32 else path
  print("Onnx selected provider: ", [provider], file=sys.stderr)
  ort_session = ort.InferenceSession(model_data, options, providers=[provider])
  print("Onnx using ", ort_session.get_providers(), file=sys.stderr)
@@ -1,36 +0,0 @@
-import onnx
-import onnxruntime as ort
-import numpy as np
-import itertools
-
-ORT_TYPES_TO_NP_TYPES = {'tensor(float16)': np.float16, 'tensor(float)': np.float32, 'tensor(uint8)': np.uint8}
-
-def attributeproto_fp16_to_fp32(attr):
-  float32_list = np.frombuffer(attr.raw_data, dtype=np.float16)
-  attr.data_type = 1
-  attr.raw_data = float32_list.astype(np.float32).tobytes()
-
-def convert_fp16_to_fp32(model):
-  for i in model.graph.initializer:
-    if i.data_type == 10:
-      attributeproto_fp16_to_fp32(i)
-  for i in itertools.chain(model.graph.input, model.graph.output):
-    if i.type.tensor_type.elem_type == 10:
-      i.type.tensor_type.elem_type = 1
-  for i in model.graph.node:
-    if i.op_type == 'Cast' and i.attribute[0].i == 10:
-      i.attribute[0].i = 1
-    for a in i.attribute:
-      if hasattr(a, 't'):
-        if a.t.data_type == 10:
-          attributeproto_fp16_to_fp32(a.t)
-  return model.SerializeToString()
-
-
-def make_onnx_cpu_runner(model_path):
-  options = ort.SessionOptions()
-  options.intra_op_num_threads = 4
-  options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
-  options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
-  model_data = convert_fp16_to_fp32(onnx.load(model_path))
-  return ort.InferenceSession(model_data,  options, providers=['CPUExecutionProvider'])
@@ -1,4 +1,4 @@
-#include "selfdrive/modeld/transforms/loadyuv.h"
+#include "sunnypilot/modeld/transforms/loadyuv.h"

 #include <cassert>
 #include <cstdio>
@@ -33,8 +33,17 @@ void loadyuv_destroy(LoadYUVState* s) {

 void loadyuv_queue(LoadYUVState* s, cl_command_queue q,
                   cl_mem y_cl, cl_mem u_cl, cl_mem v_cl,
-                   cl_mem out_cl) {
+                   cl_mem out_cl, bool do_shift) {
  cl_int global_out_off = 0;
+  if (do_shift) {
+    // shift the image in slot 1 to slot 0, then place the new image in slot 1
+    global_out_off += (s->width*s->height) + (s->width/2)*(s->height/2)*2;
+    CL_CHECK(clSetKernelArg(s->copy_krnl, 0, sizeof(cl_mem), &out_cl));
+    CL_CHECK(clSetKernelArg(s->copy_krnl, 1, sizeof(cl_int), &global_out_off));
+    const size_t copy_work_size = global_out_off/8;
+    CL_CHECK(clEnqueueNDRangeKernel(q, s->copy_krnl, 1, NULL,
+                                &copy_work_size, NULL, 0, 0, NULL));
+  }

  CL_CHECK(clSetKernelArg(s->loadys_krnl, 0, sizeof(cl_mem), &y_cl));
  CL_CHECK(clSetKernelArg(s->loadys_krnl, 1, sizeof(cl_mem), &out_cl));
@@ -63,14 +72,3 @@ void loadyuv_queue(LoadYUVState* s, cl_command_queue q,
  CL_CHECK(clEnqueueNDRangeKernel(q, s->loaduv_krnl, 1, NULL,
                               &loaduv_work_size, NULL, 0, 0, NULL));
 }
-
-void copy_queue(LoadYUVState* s, cl_command_queue q, cl_mem src, cl_mem dst,
-                 size_t src_offset, size_t dst_offset, size_t size) {
-  CL_CHECK(clSetKernelArg(s->copy_krnl, 0, sizeof(cl_mem), &src));
-  CL_CHECK(clSetKernelArg(s->copy_krnl, 1, sizeof(cl_mem), &dst));
-  CL_CHECK(clSetKernelArg(s->copy_krnl, 2, sizeof(cl_int), &src_offset));
-  CL_CHECK(clSetKernelArg(s->copy_krnl, 3, sizeof(cl_int), &dst_offset));
-  const size_t copy_work_size = size/8;
-  CL_CHECK(clEnqueueNDRangeKernel(q, s->copy_krnl, 1, NULL,
-                              &copy_work_size, NULL, 0, 0, NULL));
-}
@@ -1,7 +1,7 @@
 #define UV_SIZE ((TRANSFORMED_WIDTH/2)*(TRANSFORMED_HEIGHT/2))

 __kernel void loadys(__global uchar8 const * const Y,
-                     __global uchar * out,
+                     __global float * out,
                     int out_offset)
 {
    const int gid = get_global_id(0);
@@ -10,12 +10,13 @@ __kernel void loadys(__global uchar8 const * const Y,
    const int ox = ois % TRANSFORMED_WIDTH;

    const uchar8 ys = Y[gid];
+    const float8 ysf = convert_float8(ys);

    // 02
    // 13

-    __global uchar* outy0;
-    __global uchar* outy1;
+    __global float* outy0;
+    __global float* outy1;
    if ((oy & 1) == 0) {
      outy0 = out + out_offset; //y0
      outy1 = out + out_offset + UV_SIZE*2; //y2
@@ -24,24 +25,23 @@ __kernel void loadys(__global uchar8 const * const Y,
      outy1 = out + out_offset + UV_SIZE*3; //y3
    }

-    vstore4(ys.s0246, 0, outy0 + (oy/2) * (TRANSFORMED_WIDTH/2) + ox/2);
-    vstore4(ys.s1357, 0, outy1 + (oy/2) * (TRANSFORMED_WIDTH/2) + ox/2);
+    vstore4(ysf.s0246, 0, outy0 + (oy/2) * (TRANSFORMED_WIDTH/2) + ox/2);
+    vstore4(ysf.s1357, 0, outy1 + (oy/2) * (TRANSFORMED_WIDTH/2) + ox/2);
 }

 __kernel void loaduv(__global uchar8 const * const in,
-                     __global uchar8 * out,
+                     __global float8 * out,
                     int out_offset)
 {
  const int gid = get_global_id(0);
  const uchar8 inv = in[gid];
-  out[gid + out_offset / 8] = inv;
+  const float8 outv  = convert_float8(inv);
+  out[gid + out_offset / 8] = outv;
 }

-__kernel void copy(__global uchar8 * in,
-                   __global uchar8 * out,
-                   int in_offset,
-                   int out_offset)
+__kernel void copy(__global float8 * inout,
+                   int in_offset)
 {
  const int gid = get_global_id(0);
-  out[gid + out_offset / 8] = in[gid + in_offset / 8];
+  inout[gid] = inout[gid + in_offset / 8];
 }
@@ -13,8 +13,4 @@ void loadyuv_destroy(LoadYUVState* s);

 void loadyuv_queue(LoadYUVState* s, cl_command_queue q,
                   cl_mem y_cl, cl_mem u_cl, cl_mem v_cl,
-                   cl_mem out_cl);
-
-
-void copy_queue(LoadYUVState* s, cl_command_queue q, cl_mem src, cl_mem dst,
-                 size_t src_offset, size_t dst_offset, size_t size);
+                   cl_mem out_cl, bool do_shift = false);
@@ -1,4 +1,4 @@
-#include "selfdrive/modeld/transforms/transform.h"
+#include "sunnypilot/modeld/transforms/transform.h"

 #include <cassert>
 #include <cstring>
Author	SHA1	Message	Date
DevTekVE	6e5b17a660	Cleaning up	2025-01-06 22:40:03 +01:00
DevTekVE	8f56d4d80b	Bringing what was on master back then	2025-01-06 22:29:55 +01:00
Jason Wen	265a1a0ada	old desired_curv data: MLSIM V1 to Postal Service	2025-01-06 15:35:47 -05:00
Jason Wen	0fca387821	lateral_control_params & prev_desired_curv: MLSIM V0 to Null Pointer	2025-01-06 15:34:37 -05:00
Jason Wen	d957a92fbf	cherry pick from devtekve as base	2025-01-06 13:54:32 -05:00
Jason Wen	5809ab3baa	load model and metadata dynamically	2025-01-06 00:46:45 -05:00
Jason Wen	570789a179	parse inputs via metadata	2025-01-06 00:09:52 -05:00