Refactor modeld for flexible input rates and output parsing.

Add support for both 20Hz and variable input rates in `modeld.py` by introducing conditional data updates based on a new `is_20hz` flag. Additionally, enhance `parse_outputs` in `parse_model_outputs.py` to handle `desired_curvature` parsing when relevant input keys are present.
Refactor DrivingModelFrame to remove template parameters
2026-07-26 22:32:06 +08:00 · 2025-01-06 21:37:05 +01:00 · 2025-01-06 20:02:03 +01:00 · 2025-01-06 18:46:33 +01:00 · 2025-01-06 18:46:33 +01:00 · 2025-01-06 18:46:33 +01:00
16 changed files with 354 additions and 85 deletions
@@ -212,7 +212,7 @@ std::unordered_map<std::string, uint32_t> keys = {

    // Model Manager params
    {"ModelManager_ActiveBundle", PERSISTENT},
-    {"ModelManager_DownloadIndex", CLEAR_ON_MANAGER_START | CLEAR_ON_OFFROAD_TRANSITION | CLEAR_ON_ONROAD_TRANSITION},
+    {"ModelManager_DownloadIndex", CLEAR_ON_MANAGER_START | CLEAR_ON_ONROAD_TRANSITION},
    {"ModelManager_LastSyncTime", CLEAR_ON_MANAGER_START | CLEAR_ON_OFFROAD_TRANSITION},
    {"ModelManager_ModelsCache", PERSISTENT | BACKUP},

@@ -21,7 +21,7 @@ else:
  libs += ['OpenCL']

 # Set path definitions
-for pathdef, fn in {'TRANSFORM': 'transforms/transform.cl', 'LOADYUV': 'transforms/loadyuv.cl'}.items():
+for pathdef, fn in {'TRANSFORM': 'transforms/transform.cl', 'LOADYUV': 'transforms/loadyuv.cl', 'LOADYUV_FLOAT': 'transforms/loadyuv_float.cl'}.items():
  for xenv in (lenv, lenvCython):
    xenv['CXXFLAGS'].append(f'-D{pathdef}_PATH=\\"{File(fn).abspath}\\"')

@@ -1,21 +1,13 @@
 #!/usr/bin/env python3
-import os
 from openpilot.system.hardware import TICI

+from openpilot.selfdrive.modeld.runners.model_runner import ONNXRunner, TinygradRunner
+
 #
-if TICI:
-  from tinygrad.tensor import Tensor
-  from tinygrad.dtype import dtypes
-  from openpilot.selfdrive.modeld.runners.tinygrad_helpers import qcom_tensor_from_opencl_address
-  os.environ['QCOM'] = '1'
-else:
-  from openpilot.selfdrive.modeld.runners.ort_helpers import make_onnx_cpu_runner
 import time
-import pickle
 import numpy as np
 import cereal.messaging as messaging
 from cereal import car, log
-from pathlib import Path
 from setproctitle import setproctitle
 from cereal.messaging import PubMaster, SubMaster
 from msgq.visionipc import VisionIpcClient, VisionStreamType, VisionBuf
@@ -31,15 +23,12 @@ from openpilot.selfdrive.controls.lib.desire_helper import DesireHelper
 from openpilot.selfdrive.modeld.parse_model_outputs import Parser
 from openpilot.selfdrive.modeld.fill_model_msg import fill_model_msg, fill_pose_msg, PublishState
 from openpilot.selfdrive.modeld.constants import ModelConstants
-from openpilot.selfdrive.modeld.models.commonmodel_pyx import DrivingModelFrame, CLContext
+# from openpilot.selfdrive.modeld.models.commonmodel_pyx import DrivingModelFrame, CLContext
+from openpilot.selfdrive.modeld.models.commonmodel_pyx import DrivingModelFrameLegacy as DrivingModelFrame, CLContext


 PROCESS_NAME = "selfdrive.modeld.modeld"
-SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')

-MODEL_PATH = Path(__file__).parent / 'models/supercombo.onnx'
-MODEL_PKL_PATH = Path(__file__).parent / 'models/supercombo_tinygrad.pkl'
-METADATA_PATH = Path(__file__).parent / 'models/supercombo_metadata.pkl'

 class FrameMeta:
  frame_id: int = 0
@@ -61,35 +50,26 @@ class ModelState:
    self.prev_desire = np.zeros(ModelConstants.DESIRE_LEN, dtype=np.float32)
    self.full_features_20Hz = np.zeros((ModelConstants.FULL_HISTORY_BUFFER_LEN, ModelConstants.FEATURE_LEN), dtype=np.float32)
    self.desire_20Hz =  np.zeros((ModelConstants.FULL_HISTORY_BUFFER_LEN + 1, ModelConstants.DESIRE_LEN), dtype=np.float32)
+    self.is_20hz = False
+    # Initialize model runner
+    self.model_runner = TinygradRunner(self.frames) if TICI else ONNXRunner(self.frames)

    # img buffers are managed in openCL transform code
-    self.numpy_inputs = {
-      'desire': np.zeros((1, (ModelConstants.HISTORY_BUFFER_LEN+1), ModelConstants.DESIRE_LEN), dtype=np.float32),
-      'traffic_convention': np.zeros((1, ModelConstants.TRAFFIC_CONVENTION_LEN), dtype=np.float32),
-      'features_buffer': np.zeros((1, ModelConstants.HISTORY_BUFFER_LEN,  ModelConstants.FEATURE_LEN), dtype=np.float32),
-    }
+    self.numpy_inputs = {}

-    with open(METADATA_PATH, 'rb') as f:
-      model_metadata = pickle.load(f)
-    self.input_shapes =  model_metadata['input_shapes']
+    for key, shape in self.model_runner.input_shapes.items():
+      if key not in self.frames: # Managed by opencl
+        self.numpy_inputs[key] = np.zeros(shape, dtype=np.float32)

-    self.output_slices = model_metadata['output_slices']
-    net_output_size = model_metadata['output_shapes']['outputs'][1]
-    self.output = np.zeros(net_output_size, dtype=np.float32)
    self.parser = Parser()

-    if TICI:
-      self.tensor_inputs = {k: Tensor(v, device='NPY').realize() for k,v in self.numpy_inputs.items()}
-      with open(MODEL_PKL_PATH, "rb") as f:
-        self.model_run = pickle.load(f)
-    else:
-      self.onnx_cpu_runner = make_onnx_cpu_runner(MODEL_PATH)
+    net_output_size = self.model_runner.model_metadata['output_shapes']['outputs'][1]
+    self.output = np.zeros(net_output_size, dtype=np.float32)

-  def slice_outputs(self, model_outputs: np.ndarray) -> dict[str, np.ndarray]:
-    parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in self.output_slices.items()}
-    if SEND_RAW_PRED:
-      parsed_model_outputs['raw_pred'] = model_outputs.copy()
-    return parsed_model_outputs
+    num_elements = self.numpy_inputs['features_buffer'].shape[1]
+    step_size = int(-100 / num_elements)
+    self.full_features_20Hz_idxs = np.arange(step_size, step_size * (num_elements + 1), step_size)[::-1]
+    self.desire_reshape_dims = (self.numpy_inputs['desire'].shape[0], self.numpy_inputs['desire'].shape[1], -1, self.numpy_inputs['desire'].shape[2])

  def run(self, buf: VisionBuf, wbuf: VisionBuf, transform: np.ndarray, transform_wide: np.ndarray,
                inputs: dict[str, np.ndarray], prepare_only: bool) -> dict[str, np.ndarray] | None:
@@ -98,38 +78,54 @@ class ModelState:
    new_desire = np.where(inputs['desire'] - self.prev_desire > .99, inputs['desire'], 0)
    self.prev_desire[:] = inputs['desire']

-    self.desire_20Hz[:-1] = self.desire_20Hz[1:]
-    self.desire_20Hz[-1] = new_desire
-    self.numpy_inputs['desire'][:] = self.desire_20Hz.reshape((1,25,4,-1)).max(axis=2)
+    if self.is_20hz:
+      self.desire_20Hz[:-1] = self.desire_20Hz[1:]
+      self.desire_20Hz[-1] = new_desire
+      self.numpy_inputs['desire'][:] = self.desire_20Hz.reshape(self.desire_reshape_dims).max(axis=2)
+    else:
+      len = inputs['desire'].shape[0]
+      self.numpy_inputs['desire'][0, :-1] = self.numpy_inputs['desire'][0, 1:]
+      self.numpy_inputs['desire'][0, -1, :len] = new_desire[:len]
+
+    for key in self.numpy_inputs:
+      if key in inputs and key not in ['desire']:
+        self.numpy_inputs[key][:] = inputs[key]

-    self.numpy_inputs['traffic_convention'][:] = inputs['traffic_convention']
    imgs_cl = {'input_imgs': self.frames['input_imgs'].prepare(buf, transform.flatten()),
               'big_input_imgs': self.frames['big_input_imgs'].prepare(wbuf, transform_wide.flatten())}

-    if TICI:
-      # The imgs tensors are backed by opencl memory, only need init once
-      for key in imgs_cl:
-        if key not in self.tensor_inputs:
-          self.tensor_inputs[key] = qcom_tensor_from_opencl_address(imgs_cl[key].mem_address, self.input_shapes[key], dtype=dtypes.uint8)
-    else:
-      for key in imgs_cl:
-        self.numpy_inputs[key] = self.frames[key].buffer_from_cl(imgs_cl[key]).reshape(self.input_shapes[key])
+    # Prepare inputs using the model runner
+    self.model_runner.prepare_inputs(imgs_cl, self.numpy_inputs)

    if prepare_only:
      return None

-    if TICI:
-      self.output = self.model_run(**self.tensor_inputs).numpy().flatten()
+    # Run model inference
+    self.output = self.model_runner.run_model()
+    outputs = self.parser.parse_outputs(self.model_runner.slice_outputs(self.output), self.numpy_inputs.keys())
+
+    if self.is_20hz:
+      self.full_features_20Hz[:-1] = self.full_features_20Hz[1:]
+      self.full_features_20Hz[-1] = outputs['hidden_state'][0, :]
+      self.numpy_inputs['features_buffer'][:] = self.full_features_20Hz[self.full_features_20Hz_idxs]
    else:
-      self.output = self.onnx_cpu_runner.run(None, self.numpy_inputs)[0].flatten()
+      feature_len = outputs['hidden_state'].shape[1]
+      self.numpy_inputs['features_buffer'][0, :-1] = self.numpy_inputs['features_buffer'][0, 1:]
+      self.numpy_inputs['features_buffer'][0, -1, :feature_len] = outputs['hidden_state'][0, :feature_len]

-    outputs = self.parser.parse_outputs(self.slice_outputs(self.output))

-    self.full_features_20Hz[:-1] = self.full_features_20Hz[1:]
-    self.full_features_20Hz[-1] = outputs['hidden_state'][0, :]
+    if "desired_curvature" in outputs:
+      input_name_prev = None

-    idxs = np.arange(-4,-100,-4)[::-1]
-    self.numpy_inputs['features_buffer'][:] = self.full_features_20Hz[idxs]
+      if "prev_desired_curvs" in self.numpy_inputs.keys():
+        input_name_prev = 'prev_desired_curvs'
+      elif "prev_desired_curv" in self.numpy_inputs.keys():
+        input_name_prev = 'prev_desired_curv'
+
+      if input_name_prev is not None:
+        len = outputs['desired_curvature'][0].size
+        self.numpy_inputs[input_name_prev][0, :-len, 0] = self.numpy_inputs[input_name_prev][0, len:, 0]
+        self.numpy_inputs[input_name_prev][0, -len:, 0] = outputs['desired_curvature'][0]
    return outputs


@@ -190,7 +186,6 @@ def main(demo=False):
  meta_main = FrameMeta()
  meta_extra = FrameMeta()

-
  if demo:
    CP = get_demo_car_params()
  else:
@@ -272,6 +267,9 @@ def main(demo=False):
      'traffic_convention': traffic_convention,
      }

+    if "lateral_control_params" in model.numpy_inputs.keys():
+      inputs['lateral_control_params'] = np.array([sm["carState"].vEgo, steer_delay], dtype=np.float32)
+
    mt1 = time.perf_counter()
    model_output = model.run(buf_main, buf_extra, model_transform_main, model_transform_extra, inputs, prepare_only)
    mt2 = time.perf_counter()
@@ -8,25 +8,30 @@
 DrivingModelFrame::DrivingModelFrame(cl_device_id device_id, cl_context context) : ModelFrame(device_id, context) {
  input_frames = std::make_unique<uint8_t[]>(buf_size);
  input_frames_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err));
-  img_buffer_20hz_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, 5*frame_size_bytes, NULL, &err));
-  region.origin = 4 * frame_size_bytes;
+  img_buffer_20hz_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_length*frame_size_bytes, NULL, &err));
+  region.origin = (buffer_length - 1)  * frame_size_bytes;
  region.size = frame_size_bytes;
  last_img_cl = CL_CHECK_ERR(clCreateSubBuffer(img_buffer_20hz_cl, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err));
+  printf("Buffer length: %d, region origin: %lu, region size: %lu\n", buffer_length, region.origin, region.size);

-  loadyuv_init(&loadyuv, context, device_id, MODEL_WIDTH, MODEL_HEIGHT);
+  loadyuv_init(&loadyuv, context, device_id, MODEL_WIDTH, MODEL_HEIGHT, is_float);
  init_transform(device_id, context, MODEL_WIDTH, MODEL_HEIGHT);
 }

 cl_mem* DrivingModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection) {
  run_transform(yuv_cl, MODEL_WIDTH, MODEL_HEIGHT, frame_width, frame_height, frame_stride, frame_uv_offset, projection);

-  for (int i = 0; i < 4; i++) {
+  for (int i = 0; i < (buffer_length - 1); i++) {
+    printf("Moving %d to %d from src_offset %lu to dst_offset %lu with size %lu\n", i+1, i, (i+1)*frame_size_bytes, i*frame_size_bytes, frame_size_bytes);
    CL_CHECK(clEnqueueCopyBuffer(q, img_buffer_20hz_cl, img_buffer_20hz_cl, (i+1)*frame_size_bytes, i*frame_size_bytes, frame_size_bytes, 0, nullptr, nullptr));
  }
+  printf("Loop done\n");
  loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, last_img_cl);

  copy_queue(&loadyuv, q, img_buffer_20hz_cl, input_frames_cl, 0, 0, frame_size_bytes);
+  printf("Copying from %p to %p with size %lu\n", img_buffer_20hz_cl, input_frames_cl, frame_size_bytes);
  copy_queue(&loadyuv, q, last_img_cl, input_frames_cl, 0, frame_size_bytes, frame_size_bytes);
+  printf("Copying from %p to %p with size %lu\n", last_img_cl, input_frames_cl, frame_size_bytes);

  // NOTE: Since thneed is using a different command queue, this clFinish is needed to ensure the image is ready.
  clFinish(q);
@@ -41,6 +46,45 @@ DrivingModelFrame::~DrivingModelFrame() {
  CL_CHECK(clReleaseCommandQueue(q));
 }

+DrivingModelFrameLegacy::DrivingModelFrameLegacy(cl_device_id device_id, cl_context context): ModelFrame(device_id, context) {
+  input_frames = std::make_unique<float[]>(buf_size);
+  input_frames_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size * sizeof(float), NULL, &err));
+  // Reduce buffer to 2 frames due to float size (4x bigger than uint8_t)
+  img_buffer_20hz_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, 2 * frame_size_bytes * sizeof(float), NULL, &err));
+    
+  // Adjust region for 2-frame buffer
+  region.origin = frame_size_bytes * sizeof(float);  // Point to second frame
+  region.size = frame_size_bytes * sizeof(float);
+  last_img_cl = CL_CHECK_ERR(clCreateSubBuffer(img_buffer_20hz_cl, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err));
+
+  loadyuv_init(&loadyuv, context, device_id, MODEL_WIDTH, MODEL_HEIGHT, true);
+  init_transform(device_id, context, MODEL_WIDTH, MODEL_HEIGHT);
+}
+
+cl_mem* DrivingModelFrameLegacy::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset,
+  const mat3 &projection) {
+  run_transform(yuv_cl, MODEL_WIDTH, MODEL_HEIGHT, frame_width, frame_height, frame_stride, frame_uv_offset, projection);
+
+  // Rolling buffer with just 2 frames
+  CL_CHECK(clEnqueueCopyBuffer(q, img_buffer_20hz_cl, img_buffer_20hz_cl, frame_size_bytes * sizeof(float), 0, frame_size_bytes * sizeof(float), 0, nullptr, nullptr));
+    
+  loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, last_img_cl);
+
+  // Copy both frames to input buffer
+  copy_queue(&loadyuv, q, img_buffer_20hz_cl, input_frames_cl, 0, 0, frame_size_bytes * sizeof(float));
+  copy_queue(&loadyuv, q, last_img_cl, input_frames_cl, 0, frame_size_bytes * sizeof(float), frame_size_bytes * sizeof(float));
+
+  clFinish(q);
+  return &input_frames_cl;
+}
+
+DrivingModelFrameLegacy::~DrivingModelFrameLegacy() {
+  deinit_transform();
+  loadyuv_destroy(&loadyuv);
+  CL_CHECK(clReleaseMemObject(img_buffer_20hz_cl));
+  CL_CHECK(clReleaseMemObject(input_frames_cl));
+  CL_CHECK(clReleaseCommandQueue(q));
+}

 MonitoringModelFrame::MonitoringModelFrame(cl_device_id device_id, cl_context context) : ModelFrame(device_id, context) {
  input_frames = std::make_unique<uint8_t[]>(buf_size);
@@ -17,6 +17,7 @@
 #include "selfdrive/modeld/transforms/loadyuv.h"
 #include "selfdrive/modeld/transforms/transform.h"

+template <typename T = uint8_t>
 class ModelFrame {
 public:
  ModelFrame(cl_device_id device_id, cl_context context) {
@@ -24,7 +25,7 @@ public:
  }
  virtual ~ModelFrame() {}
  virtual cl_mem* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection) { return NULL; }
-  uint8_t* buffer_from_cl(cl_mem *in_frames, int buffer_size) {
+  T* buffer_from_cl(cl_mem *in_frames, int buffer_size) {
    CL_CHECK(clEnqueueReadBuffer(q, *in_frames, CL_TRUE, 0, buffer_size, input_frames.get(), 0, nullptr, nullptr));
    clFinish(q);
    return &input_frames[0];
@@ -39,7 +40,7 @@ protected:
  cl_mem y_cl, u_cl, v_cl;
  Transform transform;
  cl_command_queue q;
-  std::unique_ptr<uint8_t[]> input_frames;
+  std::unique_ptr<T[]> input_frames;

  void init_transform(cl_device_id device_id, cl_context context, int model_width, int model_height) {
    y_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, model_width * model_height, NULL, &err));
@@ -62,7 +63,11 @@ protected:
  }
 };

-class DrivingModelFrame : public ModelFrame {
+class DrivingModelFrame : public ModelFrame<> {
+  using ModelFrame::q, ModelFrame::y_cl, ModelFrame::u_cl, ModelFrame::v_cl;
+  using ModelFrame::init_transform, ModelFrame::deinit_transform, ModelFrame::run_transform;
+  using ModelFrame::input_frames;
+  
 public:
  DrivingModelFrame(cl_device_id device_id, cl_context context);
  ~DrivingModelFrame();
@@ -73,6 +78,8 @@ public:
  const int MODEL_FRAME_SIZE = MODEL_WIDTH * MODEL_HEIGHT * 3 / 2;
  const int buf_size = MODEL_FRAME_SIZE * 2;
  const size_t frame_size_bytes = MODEL_FRAME_SIZE * sizeof(uint8_t);
+  const bool is_float = false;
+  const uint8_t buffer_length = is_float ? 2 : 5;

 private:
  LoadYUVState loadyuv;
@@ -80,7 +87,7 @@ private:
  cl_buffer_region region;
 };

-class MonitoringModelFrame : public ModelFrame {
+class MonitoringModelFrame : public ModelFrame<> {
 public:
  MonitoringModelFrame(cl_device_id device_id, cl_context context);
  ~MonitoringModelFrame();
@@ -94,3 +101,23 @@ public:
 private:
  cl_mem input_frame_cl;
 };
+
+class DrivingModelFrameLegacy : public ModelFrame<float> {
+public:
+  DrivingModelFrameLegacy(cl_device_id device_id, cl_context context);
+  ~DrivingModelFrameLegacy();
+  cl_mem* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection);
+
+  const int MODEL_WIDTH = 512;
+  const int MODEL_HEIGHT = 256;
+  const int MODEL_FRAME_SIZE = MODEL_WIDTH * MODEL_HEIGHT * 3 / 2;
+  const int buf_size = MODEL_FRAME_SIZE * 2;
+
+private:
+  cl_mem input_frames_cl;
+  cl_mem img_buffer_20hz_cl;
+  cl_mem last_img_cl;
+  LoadYUVState loadyuv;
+  cl_buffer_region region;
+  const size_t frame_size_bytes = MODEL_FRAME_SIZE * sizeof(float);  // This is the only change
+};
@@ -12,15 +12,19 @@ cdef extern from "common/clutil.h":
  cl_context cl_create_context(cl_device_id)

 cdef extern from "selfdrive/modeld/models/commonmodel.h":
-  cppclass ModelFrame:
+  cppclass ModelFrame[T]:
    int buf_size
-    unsigned char * buffer_from_cl(cl_mem*, int);
+    T * buffer_from_cl(cl_mem*, int);
    cl_mem * prepare(cl_mem, int, int, int, int, mat3)

  cppclass DrivingModelFrame:
    int buf_size
    DrivingModelFrame(cl_device_id, cl_context)

+  cppclass DrivingModelFrameLegacy:
+    int buf_size
+    DrivingModelFrameLegacy(cl_device_id, cl_context)
+
  cppclass MonitoringModelFrame:
    int buf_size
    MonitoringModelFrame(cl_device_id, cl_context)
@@ -4,12 +4,12 @@
 import numpy as np
 cimport numpy as cnp
 from libc.string cimport memcpy
-from libc.stdint cimport uintptr_t
+from libc.stdint cimport uintptr_t, uint8_t

 from msgq.visionipc.visionipc cimport cl_mem
 from msgq.visionipc.visionipc_pyx cimport VisionBuf, CLContext as BaseCLContext
 from .commonmodel cimport CL_DEVICE_TYPE_DEFAULT, cl_get_device_id, cl_create_context
-from .commonmodel cimport mat3, ModelFrame as cppModelFrame, DrivingModelFrame as cppDrivingModelFrame, MonitoringModelFrame as cppMonitoringModelFrame
+from .commonmodel cimport mat3, ModelFrame as cppModelFrame, DrivingModelFrame as cppDrivingModelFrame, DrivingModelFrameLegacy as cppDrivingModelFrameLegacy, MonitoringModelFrame as cppMonitoringModelFrame


 cdef class CLContext(BaseCLContext):
@@ -33,7 +33,7 @@ def cl_from_visionbuf(VisionBuf buf):


 cdef class ModelFrame:
-  cdef cppModelFrame * frame
+  cdef cppModelFrame[uint8_t] * frame
  cdef int buf_size

  def __dealloc__(self):
@@ -52,12 +52,40 @@ cdef class ModelFrame:
    return np.asarray(<cnp.uint8_t[:self.buf_size]> data2)


+cdef class ModelFrame_float:
+  cdef cppModelFrame[float] * frame
+  cdef int buf_size
+
+  def __dealloc__(self):
+    del self.frame
+
+  def prepare(self, VisionBuf buf, float[:] projection):
+    cdef mat3 cprojection
+    memcpy(cprojection.v, &projection[0], 9*sizeof(float))
+    cdef cl_mem * data
+    data = self.frame.prepare(buf.buf.buf_cl, buf.width, buf.height, buf.stride, buf.uv_offset, cprojection)
+    return CLMem.create(data)
+
+  def buffer_from_cl(self, CLMem in_frames):
+    cdef float * data2
+    data2 = self.frame.buffer_from_cl(in_frames.mem, self.buf_size)
+    return np.asarray(<cnp.float32_t[:self.buf_size]> data2)
+
+
 cdef class DrivingModelFrame(ModelFrame):
  cdef cppDrivingModelFrame * _frame

  def __cinit__(self, CLContext context):
    self._frame = new cppDrivingModelFrame(context.device_id, context.context)
-    self.frame = <cppModelFrame*>(self._frame)
+    self.frame = <cppModelFrame[uint8_t]*>(self._frame)
+    self.buf_size = self._frame.buf_size
+
+cdef class DrivingModelFrameLegacy(ModelFrame_float):
+  cdef cppDrivingModelFrameLegacy * _frame
+
+  def __cinit__(self, CLContext context):
+    self._frame = new cppDrivingModelFrameLegacy(context.device_id, context.context)
+    self.frame = <cppModelFrame[float]*>(self._frame)
    self.buf_size = self._frame.buf_size

 cdef class MonitoringModelFrame(ModelFrame):
@@ -65,6 +93,6 @@ cdef class MonitoringModelFrame(ModelFrame):

  def __cinit__(self, CLContext context):
    self._frame = new cppMonitoringModelFrame(context.device_id, context.context)
-    self.frame = <cppModelFrame*>(self._frame)
+    self.frame = <cppModelFrame[uint8_t]*>(self._frame)
    self.buf_size = self._frame.buf_size

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:72d3d6f8d3c98f5431ec86be77b6350d7d4f43c25075c0106f1d1e7ec7c77668
-size 49096168
+oid sha256:b31b504bc0b440d3bc72967507a00eb4f112285626fbfb3135011500325ee6d6
+size 51452435
@@ -84,7 +84,8 @@ class Parser:
    outs[name] = pred_mu_final.reshape(final_shape)
    outs[name + '_stds'] = pred_std_final.reshape(final_shape)

-  def parse_outputs(self, outs: dict[str, np.ndarray]) -> dict[str, np.ndarray]:
+  def parse_outputs(self, outs: dict[str, np.ndarray], input_keys: [str]) -> dict[str, np.ndarray]:
+    """ Parse the model outputs into a dictionary of numpy arrays. The input_keys are used to determine how the output should be parsed. """
    self.parse_mdn('plan', outs, in_N=ModelConstants.PLAN_MHP_N, out_N=ModelConstants.PLAN_MHP_SELECTION,
                   out_shape=(ModelConstants.IDX_N,ModelConstants.PLAN_WIDTH))
    self.parse_mdn('lane_lines', outs, in_N=0, out_N=0, out_shape=(ModelConstants.NUM_LANE_LINES,ModelConstants.IDX_N,ModelConstants.LANE_LINES_WIDTH))
@@ -96,6 +97,8 @@ class Parser:
                   out_shape=(ModelConstants.LEAD_TRAJ_LEN,ModelConstants.LEAD_WIDTH))
    if 'lat_planner_solution' in outs:
      self.parse_mdn('lat_planner_solution', outs, in_N=0, out_N=0, out_shape=(ModelConstants.IDX_N,ModelConstants.LAT_PLANNER_SOLUTION_WIDTH))
+    if 'desired_curvature' in outs and "prev_desired_curv" in input_keys:
+      self.parse_mdn('desired_curvature', outs, in_N=0, out_N=0, out_shape=(ModelConstants.DESIRED_CURV_WIDTH,))
    for k in ['lead_prob', 'lane_lines_prob', 'meta']:
      self.parse_binary_crossentropy(k, outs)
    self.parse_categorical_crossentropy('desire_state', outs, out_shape=(ModelConstants.DESIRE_PRED_WIDTH,))
@@ -0,0 +1,115 @@
+import os
+from openpilot.system.hardware import TICI
+
+#
+from tinygrad.tensor import Tensor, dtypes
+from openpilot.selfdrive.modeld.runners.tinygrad_helpers import qcom_tensor_from_opencl_address
+from openpilot.selfdrive.modeld.runners.ort_helpers import make_onnx_cpu_runner, ORT_TYPES_TO_NP_TYPES
+import pickle
+import numpy as np
+from pathlib import Path
+from abc import ABC, abstractmethod
+# from openpilot.selfdrive.modeld.models.commonmodel_pyx import DrivingModelFrame, CLMem
+from openpilot.selfdrive.modeld.models.commonmodel_pyx import DrivingModelFrameLegacy as DrivingModelFrame, CLMem
+
+if TICI:
+  os.environ['QCOM'] = '1'
+
+SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')
+MODEL_PATH = Path(__file__).parent / '../models/supercombo.onnx'
+MODEL_PKL_PATH = Path(__file__).parent / '../models/supercombo_tinygrad.pkl'
+METADATA_PATH = Path(__file__).parent / '../models/supercombo_metadata.pkl'
+
+
+class ModelRunner(ABC):
+  """Abstract base class for model runners that defines the interface for running ML models."""
+
+  def __init__(self):
+    """Initialize the model runner with paths to model and metadata files."""
+    with open(METADATA_PATH, 'rb') as f:
+      self.model_metadata = pickle.load(f)
+    self.input_shapes = self.model_metadata['input_shapes']
+    self.output_slices = self.model_metadata['output_slices']
+    self.inputs: dict = {}
+
+  @abstractmethod
+  def prepare_inputs(self, imgs_cl: dict[str, CLMem], numpy_inputs: dict[str, np.ndarray])-> dict:
+    """Prepare inputs for model inference."""
+
+  @abstractmethod
+  def run_model(self):
+    """Run model inference with prepared inputs."""
+
+  def slice_outputs(self, model_outputs: np.ndarray) -> dict:
+    """Slice model outputs according to metadata configuration."""
+    parsed_outputs = {k: model_outputs[np.newaxis, v] for k, v in self.output_slices.items()}
+    if SEND_RAW_PRED:
+      parsed_outputs['raw_pred'] = model_outputs.copy()
+    return parsed_outputs
+
+
+class TinygradRunner(ModelRunner):
+  """Tinygrad implementation of model runner for TICI hardware."""
+
+  def __init__(self, frames: dict[str, DrivingModelFrame] | None = None):
+    super().__init__()
+    # Load Tinygrad model
+    with open(MODEL_PKL_PATH, "rb") as f:
+      self.model_run = pickle.load(f)
+
+    self.input_to_dtype = {}
+    self.input_to_device = {}
+
+    for idx, name in enumerate(self.model_run.captured.expected_names):
+      self.input_to_dtype[name] = self.model_run.captured.expected_st_vars_dtype_device[idx][2]  # 2 is the dtype
+      self.input_to_device[name] = self.model_run.captured.expected_st_vars_dtype_device[idx][3]  # 3 is the device
+
+    assert TICI or frames is not None, "TinygradRunner requires frames for non-TICI hardware"
+    self.frames = frames
+    self.is_memory_model = None  # Use None to indicate that it hasn't been determined yet
+
+  def prepare_inputs(self, imgs_cl: dict[str, CLMem], numpy_inputs: dict[str, np.ndarray]) -> dict:
+    if self.is_memory_model is None:
+      self.is_memory_model = any(self.input_to_dtype[key] == dtypes.uint8 for key in imgs_cl)
+      print(f"Memory model: {self.is_memory_model}")
+
+    # Initialize image tensors if not already done
+    for key in imgs_cl:
+      if TICI and self.is_memory_model and key not in self.inputs:
+        self.inputs[key] = qcom_tensor_from_opencl_address(imgs_cl[key].mem_address, self.input_shapes[key], dtype=dtypes.uint8)
+      elif not TICI or not self.is_memory_model:
+        shape = self.frames[key].buffer_from_cl(imgs_cl[key]).reshape(self.input_shapes[key])
+        self.inputs[key] = Tensor(shape, device=self.input_to_device[key], dtype=self.input_to_dtype[key]).realize()
+
+    # Update numpy inputs
+    for key, value in numpy_inputs.items():
+      if key not in imgs_cl:
+        self.inputs[key] = Tensor(value, device=self.input_to_device[key], dtype=self.input_to_dtype[key]).realize()
+
+    return self.inputs
+
+  def run_model(self):
+    return self.model_run(**self.inputs).numpy().flatten()
+
+
+class ONNXRunner(ModelRunner):
+  """ONNX implementation of model runner for non-TICI hardware."""
+
+  def __init__(self, frames: dict[str, DrivingModelFrame]):
+    super().__init__()
+    self.runner = make_onnx_cpu_runner(MODEL_PATH)
+    self.frames = frames
+
+    self.input_to_nptype = {
+      model_input.name: ORT_TYPES_TO_NP_TYPES[model_input.type]
+      for model_input in self.runner.get_inputs()
+    }
+
+  def prepare_inputs(self, imgs_cl: dict[str, CLMem], numpy_inputs: dict[str, np.ndarray]) -> dict:
+    self.inputs = numpy_inputs.copy()
+    for key in imgs_cl:
+      self.inputs[key] = self.frames[key].buffer_from_cl(imgs_cl[key]).reshape(self.input_shapes[key])
+    return self.inputs
+
+  def run_model(self):
+    return self.runner.run(None, self.inputs)[0].flatten()
@@ -4,7 +4,7 @@
 #include <cstdio>
 #include <cstring>

-void loadyuv_init(LoadYUVState* s, cl_context ctx, cl_device_id device_id, int width, int height) {
+void loadyuv_init(LoadYUVState* s, cl_context ctx, cl_device_id device_id, int width, int height, bool use_float) {
  memset(s, 0, sizeof(*s));

  s->width = width;
@@ -15,7 +15,9 @@ void loadyuv_init(LoadYUVState* s, cl_context ctx, cl_device_id device_id, int w
           "-cl-fast-relaxed-math -cl-denorms-are-zero "
           "-DTRANSFORMED_WIDTH=%d -DTRANSFORMED_HEIGHT=%d",
           width, height);
-  cl_program prg = cl_program_from_file(ctx, device_id, LOADYUV_PATH, args);
+  const char * loadyuv_path = use_float ? LOADYUV_FLOAT_PATH : LOADYUV_PATH;
+  printf(" Use float: %d\n Using loadyuv_path: %s\n", use_float, loadyuv_path);
+  cl_program prg = cl_program_from_file(ctx, device_id, loadyuv_path, args);

  s->loadys_krnl = CL_CHECK_ERR(clCreateKernel(prg, "loadys", &err));
  s->loaduv_krnl = CL_CHECK_ERR(clCreateKernel(prg, "loaduv", &err));
@@ -7,7 +7,8 @@ typedef struct {
  cl_kernel loadys_krnl, loaduv_krnl, copy_krnl;
 } LoadYUVState;

-void loadyuv_init(LoadYUVState* s, cl_context ctx, cl_device_id device_id, int width, int height);
+void loadyuv_init(LoadYUVState* s, cl_context ctx, cl_device_id device_id, int width, int height, bool use_float);
+inline void loadyuv_init(LoadYUVState* s, cl_context ctx, cl_device_id device_id, int width, int height) { loadyuv_init(s, ctx, device_id, width, height, false); };

 void loadyuv_destroy(LoadYUVState* s);

@@ -0,0 +1,49 @@
+#define UV_SIZE ((TRANSFORMED_WIDTH/2)*(TRANSFORMED_HEIGHT/2))
+
+__kernel void loadys(__global uchar8 const * const Y,
+                     __global float * out,
+                     int out_offset)
+{
+    const int gid = get_global_id(0);
+    const int ois = gid * 8;
+    const int oy = ois / TRANSFORMED_WIDTH;
+    const int ox = ois % TRANSFORMED_WIDTH;
+
+    const uchar8 ys = Y[gid];
+    const float8 ysf = convert_float8(ys);
+
+    // 02
+    // 13
+
+    __global float* outy0;
+    __global float* outy1;
+    if ((oy & 1) == 0) {
+      outy0 = out + out_offset; //y0
+      outy1 = out + out_offset + UV_SIZE*2; //y2
+    } else {
+      outy0 = out + out_offset + UV_SIZE; //y1
+      outy1 = out + out_offset + UV_SIZE*3; //y3
+    }
+
+    vstore4(ysf.s0246, 0, outy0 + (oy/2) * (TRANSFORMED_WIDTH/2) + ox/2);
+    vstore4(ysf.s1357, 0, outy1 + (oy/2) * (TRANSFORMED_WIDTH/2) + ox/2);
+}
+
+__kernel void loaduv(__global uchar8 const * const in,
+                     __global float8 * out,
+                     int out_offset)
+{
+  const int gid = get_global_id(0);
+  const uchar8 inv = in[gid];
+  const float8 outv  = convert_float8(inv);
+  out[gid + out_offset / 8] = outv;
+}
+
+__kernel void copy(__global float8 * in,
+                   __global float8 * out,
+                   int in_offset,
+                   int out_offset)
+{
+  const int gid = get_global_id(0);
+  out[gid + out_offset / 8] = in[gid + in_offset / 8];
+}
@@ -83,8 +83,6 @@ void SoftwarePanelSP::handleBundleDownloadProgress() {
  if (bundle.getStatus() == cereal::ModelManagerSP::DownloadStatus::DOWNLOADING) {
    currentModelLblBtn->showDescription();
  }
-
-  currentModelLblBtn->setEnabled(!is_onroad && !isDownloading());
 }

 /**
@@ -128,7 +126,6 @@ void SoftwarePanelSP::handleCurrentModelLblBtnClicked() {
    bundleNames.append(index_to_bundle[index]);
  }

-  currentModelLblBtn->setEnabled(!is_onroad);
  currentModelLblBtn->setValue(GetActiveModelName());

  const QString selectedBundleName = MultiOptionDialog::getSelection(
@@ -161,6 +158,7 @@ void SoftwarePanelSP::updateLabels() {
  }

  handleBundleDownloadProgress();
+  currentModelLblBtn->setEnabled(!is_onroad && !isDownloading());
  currentModelLblBtn->setValue(GetActiveModelName());
  SoftwarePanel::updateLabels();
 }
@@ -24,7 +24,7 @@ private:
    const SubMaster &sm = *(uiStateSP()->sm);
    const auto model_manager = sm["modelManagerSP"].getModelManagerSP();

-    if (!model_manager.hasSelectedBundle()) {
+    if (!model_manager.hasSelectedBundle() || !sm.updated("modelManagerSP")) {
      return false;
    }
Author	SHA1	Message	Date
DevTekVE	e28a7a5bee	Refactor modeld for flexible input rates and output parsing. Add support for both 20Hz and variable input rates in `modeld.py` by introducing conditional data updates based on a new `is_20hz` flag. Additionally, enhance `parse_outputs` in `parse_model_outputs.py` to handle `desired_curvature` parsing when relevant input keys are present.	2025-01-06 21:37:05 +01:00
DevTekVE	5e889d9ec6	Refactor DrivingModelFrame to remove template parameters Transitioned DrivingModelFrame from a templated implementation to a single class structure, simplifying type usage and memory management across the codebase. Adjusted related buffer sizes and usage to accommodate these changes, improving maintainability and clarity. Legacy and float-specific implementations remain unchanged.	2025-01-06 20:02:03 +01:00
DevTekVE	5ebd6746d1	Testing diff approach	2025-01-06 18:46:33 +01:00
DevTekVE	9e7203567e	Increase buffer size for image processing in DrivingModelFrameLegacy Doubled the buffer size allocated for `img_buffer_cl` to accommodate larger frames and updated the copy operation accordingly. These changes ensure proper handling of image data and improve robustness in frame processing.	2025-01-06 18:46:33 +01:00
DevTekVE	aba1f4b5fc	kinda works	2025-01-06 18:46:33 +01:00
DevTekVE	5a3ee2d376	Clean	2025-01-06 18:46:33 +01:00
DevTekVE	ae5a2430d5	Introduce DrivingModelFrameLegacy for float-based model handling Added DrivingModelFrameLegacy class to handle float-based inputs, replacing the previous DrivingModelFrame_float. This includes the implementation of initialization, preparation, and teardown methods. Updated imports and references across the codebase to reflect the new class.	2025-01-06 18:46:33 +01:00
DevTekVE	6e56f37823	force	2025-01-06 18:46:33 +01:00
DevTekVE	d77d0cf622	needs	2025-01-06 18:46:32 +01:00
DevTekVE	49888d2b7c	Refactor model frame handling to support flexible data types Updated `ModelFrame` and its derived classes to use templated types, enabling support for multiple data types like `float` and `uint8_t`. Introduced a separate OpenCL kernel for `float` processing and adjusted related methods across headers, Pyx, and C++ files to accommodate this refactor. This enables more versatile data representation for improved model operations.	2025-01-06 18:46:32 +01:00
DevTekVE	efc1f6378e	Update input handling logic in modeld.py Refactored the assignment to `self.numpy_inputs` to ensure only specific keys from `inputs` are copied, excluding 'desire'. This improves accuracy and prevents unintended data overwrites in the model pipeline.	2025-01-06 18:46:32 +01:00
DevTekVE	b0fef4b5ea	Add lateral_control_params to model inputs if available This change introduces an additional check for "lateral_control_params" in model inputs. If present, it adds the lateral control parameters to the input dictionary, including vehicle speed and steering delay. This ensures compatibility with models requiring these parameters.	2025-01-06 18:46:32 +01:00
DevTekVE	e6b9497099	Refactor curvature handling in modeld input processing Simplified and unified the handling of "desired_curvature" by removing redundant code and centralizing logic. Improved clarity and maintainability by determining the appropriate input key dynamically and implementing consistent array updates.	2025-01-06 18:46:32 +01:00
DevTekVE	18bc3c5920	Refactor model runner initialization and input handling. Added a memory model detection mechanism in `TinygradRunner` and adjusted input preparation logic based on the detected model type. Fixed the `TinygradRunner` constructor to correctly pass `frames` for TICI hardware, ensuring proper model initialization.	2025-01-06 18:46:32 +01:00
DevTekVE	cfffc94a07	Refine input handling logic for non-TICI devices. This change adds an explicit check for non-TICI devices in the input processing logic, ensuring correctness and preventing unnecessary operations. The updated logic improves maintainability and clarity in device-specific handling.	2025-01-06 18:46:31 +01:00
DevTekVE	f72447245c	Refactor TinygradRunner to support non-TICI hardware Adjust TinygradRunner to handle frame data on non-TICI hardware by leveraging device and dtype mappings. Simplify input preparation logic for better compatibility and ensure proper assertion for required frames. This refactor improves flexibility and maintains existing functionality for TICI.	2025-01-06 18:46:31 +01:00
DevTekVE	c959947b8f	[WIP] Adding support for NDv2	2025-01-06 18:46:31 +01:00
DevTekVE	c9a23395ea	Refactor model runner and input initialization logic. Simplified and improved input handling for both `TinygradRunner` and `ONNXRunner`, introducing better mappings for data types and shapes. Updated buffer initialization, reshaping logic, and index calculations to enhance readability and maintainability. This ensures consistency across the model runner's input/output processing.	2025-01-06 18:46:31 +01:00
DevTekVE	50382d7c01	Refactor type annotations and return types in model_runner Removed specific type hints and return annotations for `self.inputs` and `run_model` methods to enhance flexibility and maintain consistency. These changes streamline the code and improve compatibility with varying input/output types during model inference.	2025-01-06 18:46:31 +01:00
DevTekVE	68b240d09b	Refactor type hints for improved code consistency. Replaced specific type hints like `dict[str, np.ndarray]` with generic `dict` in several method signatures to simplify annotations. This improves overall readability and maintains functionality, while aligning with existing code practices.	2025-01-06 18:46:31 +01:00
DevTekVE	556c72cb90	Clean	2025-01-06 18:46:31 +01:00
DevTekVE	c050fa7c49	Add type annotations to inputs and prepare_inputs method This commit adds explicit type annotations for the `inputs` dictionary and the return value of the `prepare_inputs` method in `model_runner.py`. These changes improve code readability and ensure type consistency, enhancing maintainability and reducing potential errors.	2025-01-06 18:46:31 +01:00
DevTekVE	0784570ead	Rename TinyGradRunner to TinygradRunner for consistency. The class name and references were updated to maintain naming consistency across the codebase. This aligns with naming conventions and improves code clarity. No functional changes were introduced.	2025-01-06 18:46:30 +01:00
DevTekVE	f67e8aca47	Refactor `prepare_inputs` to use explicit `CLMem` type. This update replaces the generic `any` type with the more explicit `CLMem` type for better type safety and clarity. It ensures consistency across the `prepare_inputs` method implementations in derived classes, improving code readability and robustness.	2025-01-06 18:46:30 +01:00
DevTekVE	5d1b403015	Refactor model inference to use internal state for inputs Simplified the `run_model` method by removing the requirement to pass inputs as arguments, and instead leveraging an internal `inputs` state. Adjusted `prepare_inputs` methods across model runners to populate this internal state. This refactor improves code clarity and reduces redundancy in managing input data.	2025-01-06 18:46:30 +01:00
DevTekVE	c59dead9fd	Fix model_runner output to ensure tensor conversion to NumPy. Updated the `run_model` method to explicitly convert tensor outputs to NumPy arrays using `.numpy()`. This ensures compatibility with downstream processes relying on NumPy array inputs.	2025-01-06 18:46:30 +01:00
DevTekVE	8512406fa9	Remove unused imports and redundant pass statements. This change cleans up the code by removing unused imports and redundant `pass` statements in abstract methods. It improves code readability and adheres to cleaner coding practices.	2025-01-06 18:46:30 +01:00
DevTekVE	7087f3f400	Remove redundant tensor assignment in model runner. The `assign` operation was unnecessary as new tensors are realized for updated inputs. This simplifies the code and avoids redundant updates, improving clarity and maintainability.	2025-01-06 18:46:30 +01:00
DevTekVE	b7e601e92d	Refactor model runner initialization logic. Removed the `create_model_runner` factory function and replaced it with direct initialization of `TinyGradRunner` or `ONNXRunner`. Simplified the `__init__` methods by standardizing paths as constants within `model_runner.py` for cleaner and more maintainable code.	2025-01-06 18:46:30 +01:00
DevTekVE	c3f92f2783	Refactor formatting in model_runner.py for readability. Consolidated multiline function declarations and calls into single lines where appropriate to improve code readability and maintainability. No changes were made to the functionality.	2025-01-06 18:46:29 +01:00
DevTekVE	5a3e80fc92	Refactoring model handling in modeld.py with ModelRunner abstraction A significant refactoring of `modeld.py` was performed to enhance the handling of model logic. A new abstraction called `ModelRunner` has been introduced which encapsulates the model-running logic. This refactor simplifies the `modeld.py` script and provides easier management across different hardware configurations. Using this segregation, varying processing methods for models can be handled distinctly ensuring cleaner and more maintainable code. An instance of the appropriate model runner is now created during initialization based on whether a TICI hardware or a different type is used.	2025-01-06 18:46:29 +01:00
DevTekVE	5fac11efbb	ui: prevent driving model change with offroad transition (#524 ) * Update model manager logic and handle offroad transitions Added is_onroad state tracking in SoftwarePanelSP to handle offroad transitions. Updated model manager conditions for improved bundle validation. Removed unnecessary clear operation for ModelManager_DownloadIndex during offroad transitions to optimize behavior. * Using is_onroad softwarePanel * Enable model label button only when conditions are met Previously, the button's state update was misplaced, leading to potential issues with its interactive availability. The logic has been adjusted to ensure it is properly enabled or disabled based on onroad status and download progress. This change improves UX consistency and prevents unintended actions. * Remove redundant setEnabled call for currentModelLblBtn The setEnabled call was unnecessary as its functionality was not required in this context. Cleaning up this code improves readability and removes redundant operations. No changes to functionality or behavior were introduced. --------- Co-authored-by: Jason Wen <haibin.wen3@gmail.com>	2025-01-06 18:46:29 +01:00