Compare commits

...

7 Commits

Author SHA1 Message Date
DevTekVE
6e5b17a660 Cleaning up 2025-01-06 22:40:03 +01:00
DevTekVE
8f56d4d80b Bringing what was on master back then 2025-01-06 22:29:55 +01:00
Jason Wen
265a1a0ada old desired_curv data: MLSIM V1 to Postal Service 2025-01-06 15:35:47 -05:00
Jason Wen
0fca387821 lateral_control_params & prev_desired_curv: MLSIM V0 to Null Pointer 2025-01-06 15:34:37 -05:00
Jason Wen
d957a92fbf cherry pick from devtekve as base 2025-01-06 13:54:32 -05:00
Jason Wen
5809ab3baa load model and metadata dynamically 2025-01-06 00:46:45 -05:00
Jason Wen
570789a179 parse inputs via metadata 2025-01-06 00:09:52 -05:00
19 changed files with 137 additions and 358 deletions

View File

@@ -15,8 +15,7 @@ class ModelConstants:
# model inputs constants
MODEL_FREQ = 20
FEATURE_LEN = 512
FULL_HISTORY_BUFFER_LEN = 99
HISTORY_BUFFER_LEN = 24
HISTORY_BUFFER_LEN = 99
DESIRE_LEN = 8
TRAFFIC_CONVENTION_LEN = 2
LAT_PLANNER_STATE_LEN = 4

View File

@@ -3,22 +3,11 @@ import capnp
import numpy as np
from cereal import log
from openpilot.sunnypilot.modeld.constants import ModelConstants, Plan, Meta
from openpilot.selfdrive.controls.lib.drive_helpers import MIN_SPEED
SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')
ConfidenceClass = log.ModelDataV2.ConfidenceClass
def curv_from_psis(psi_target, psi_rate, vego, delay):
vego = np.clip(vego, MIN_SPEED, np.inf)
curv_from_psi = psi_target / (vego * delay) # epsilon to prevent divide-by-zero
return 2*curv_from_psi - psi_rate / vego
def get_curvature_from_plan(plan, vego, delay):
psi_target = np.interp(delay, ModelConstants.T_IDXS, plan[:, Plan.T_FROM_CURRENT_EULER][:, 2])
psi_rate = plan[:, Plan.ORIENTATION_RATE][0, 2]
return curv_from_psis(psi_target, psi_rate, vego, delay)
class PublishState:
def __init__(self):
self.disengage_buffer = np.zeros(ModelConstants.CONFIDENCE_BUFFER_LEN*ModelConstants.DISENGAGE_WIDTH, dtype=np.float32)
@@ -59,24 +48,15 @@ def fill_xyz_poly(builder, degree, x, y, z):
builder.yCoefficients = coeffs[:, 1].tolist()
builder.zCoefficients = coeffs[:, 2].tolist()
def fill_lane_line_meta(builder, lane_lines, lane_line_probs):
builder.leftY = lane_lines[1].y[0]
builder.leftProb = lane_line_probs[1]
builder.rightY = lane_lines[2].y[0]
builder.rightProb = lane_line_probs[2]
def fill_model_msg(base_msg: capnp._DynamicStructBuilder, extended_msg: capnp._DynamicStructBuilder,
net_output_data: dict[str, np.ndarray], v_ego: float, delay: float,
publish_state: PublishState, vipc_frame_id: int, vipc_frame_id_extra: int,
frame_id: int, frame_drop: float, timestamp_eof: int, model_execution_time: float,
valid: bool) -> None:
net_output_data: dict[str, np.ndarray], publish_state: PublishState,
vipc_frame_id: int, vipc_frame_id_extra: int, frame_id: int, frame_drop: float,
timestamp_eof: int, model_execution_time: float, valid: bool) -> None:
frame_age = frame_id - vipc_frame_id if frame_id > vipc_frame_id else 0
frame_drop_perc = frame_drop * 100
extended_msg.valid = valid
base_msg.valid = valid
desired_curv = float(get_curvature_from_plan(net_output_data['plan'][0], v_ego, delay))
driving_model_data = base_msg.drivingModelData
driving_model_data.frameId = vipc_frame_id
@@ -85,7 +65,7 @@ def fill_model_msg(base_msg: capnp._DynamicStructBuilder, extended_msg: capnp._D
driving_model_data.modelExecutionTime = model_execution_time
action = driving_model_data.action
action.desiredCurvature = desired_curv
action.desiredCurvature = float(net_output_data['desired_curvature'][0,0])
modelV2 = extended_msg.modelV2
modelV2.frameId = vipc_frame_id
@@ -120,7 +100,7 @@ def fill_model_msg(base_msg: capnp._DynamicStructBuilder, extended_msg: capnp._D
# lateral planning
action = modelV2.action
action.desiredCurvature = desired_curv
action.desiredCurvature = float(net_output_data['desired_curvature'][0,0])
# times at X_IDXS according to model plan
PLAN_T_IDXS = [np.nan] * ModelConstants.IDX_N
@@ -150,7 +130,10 @@ def fill_model_msg(base_msg: capnp._DynamicStructBuilder, extended_msg: capnp._D
modelV2.laneLineProbs = net_output_data['lane_lines_prob'][0,1::2].tolist()
lane_line_meta = driving_model_data.laneLineMeta
fill_lane_line_meta(lane_line_meta, modelV2.laneLines, modelV2.laneLineProbs)
lane_line_meta.leftY = modelV2.laneLines[1].y[0]
lane_line_meta.leftProb = modelV2.laneLineProbs[1]
lane_line_meta.rightY = modelV2.laneLines[2].y[0]
lane_line_meta.rightProb = modelV2.laneLineProbs[2]
# road edges
modelV2.init('roadEdges', 2)

View File

@@ -17,12 +17,13 @@ from openpilot.common.realtime import config_realtime_process
from openpilot.common.transformations.camera import DEVICE_CAMERAS
from openpilot.common.transformations.model import get_warp_matrix
from openpilot.system import sentry
from openpilot.selfdrive.car.card import convert_to_capnp
from openpilot.selfdrive.controls.lib.desire_helper import DesireHelper
from openpilot.sunnypilot.modeld.runners import ModelRunner, Runtime
from openpilot.sunnypilot.modeld.parse_model_outputs import Parser
from openpilot.sunnypilot.modeld.fill_model_msg import fill_model_msg, fill_pose_msg, PublishState
from openpilot.sunnypilot.modeld.constants import ModelConstants
from openpilot.sunnypilot.modeld.models.commonmodel_pyx import DrivingModelFrame, CLContext
from openpilot.sunnypilot.modeld.models.commonmodel_pyx import ModelFrame, CLContext
PROCESS_NAME = "sunnypilot.modeld.modeld"
SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')
@@ -33,7 +34,6 @@ MODEL_PATHS = {
METADATA_PATH = Path(__file__).parent / 'models/supercombo_metadata.pkl'
class FrameMeta:
frame_id: int = 0
timestamp_sof: int = 0
@@ -44,24 +44,22 @@ class FrameMeta:
self.frame_id, self.timestamp_sof, self.timestamp_eof = vipc.frame_id, vipc.timestamp_sof, vipc.timestamp_eof
class ModelState:
frame: DrivingModelFrame
wide_frame: DrivingModelFrame
frame: ModelFrame
wide_frame: ModelFrame
inputs: dict[str, np.ndarray]
output: np.ndarray
prev_desire: np.ndarray # for tracking the rising edge of the pulse
model: ModelRunner
def __init__(self, context: CLContext):
self.frame = DrivingModelFrame(context)
self.wide_frame = DrivingModelFrame(context)
self.frame = ModelFrame(context)
self.wide_frame = ModelFrame(context)
self.prev_desire = np.zeros(ModelConstants.DESIRE_LEN, dtype=np.float32)
self.full_features_20Hz = np.zeros((ModelConstants.FULL_HISTORY_BUFFER_LEN, ModelConstants.FEATURE_LEN), dtype=np.float32)
self.desire_20Hz = np.zeros((ModelConstants.FULL_HISTORY_BUFFER_LEN + 1, ModelConstants.DESIRE_LEN), dtype=np.float32)
# img buffers are managed in openCL transform code
self.inputs = {
'desire': np.zeros(ModelConstants.DESIRE_LEN * (ModelConstants.HISTORY_BUFFER_LEN+1), dtype=np.float32),
'traffic_convention': np.zeros(ModelConstants.TRAFFIC_CONVENTION_LEN, dtype=np.float32),
'lateral_control_params': np.zeros(ModelConstants.LATERAL_CONTROL_PARAMS_LEN, dtype=np.float32),
'prev_desired_curv': np.zeros(ModelConstants.PREV_DESIRED_CURV_LEN * (ModelConstants.HISTORY_BUFFER_LEN+1), dtype=np.float32),
'features_buffer': np.zeros(ModelConstants.HISTORY_BUFFER_LEN * ModelConstants.FEATURE_LEN, dtype=np.float32),
}
@@ -89,17 +87,17 @@ class ModelState:
inputs: dict[str, np.ndarray], prepare_only: bool) -> dict[str, np.ndarray] | None:
# Model decides when action is completed, so desire input is just a pulse triggered on rising edge
inputs['desire'][0] = 0
new_desire = np.where(inputs['desire'] - self.prev_desire > .99, inputs['desire'], 0)
self.inputs['desire'][:-ModelConstants.DESIRE_LEN] = self.inputs['desire'][ModelConstants.DESIRE_LEN:]
self.inputs['desire'][-ModelConstants.DESIRE_LEN:] = np.where(inputs['desire'] - self.prev_desire > .99, inputs['desire'], 0)
self.prev_desire[:] = inputs['desire']
self.desire_20Hz[:-1] = self.desire_20Hz[1:]
self.desire_20Hz[-1] = new_desire
self.inputs['desire'][:] = self.desire_20Hz.reshape((25,4,-1)).max(axis=1).flatten()
self.inputs['traffic_convention'][:] = inputs['traffic_convention']
self.inputs['lateral_control_params'][:] = inputs['lateral_control_params']
# if getCLBuffer is not None, frame will be None
self.model.setInputBuffer("input_imgs", self.frame.prepare(buf, transform.flatten(), self.model.getCLBuffer("input_imgs")))
self.model.setInputBuffer("big_input_imgs", self.wide_frame.prepare(wbuf, transform_wide.flatten(), self.model.getCLBuffer("big_input_imgs")))
if wbuf is not None:
self.model.setInputBuffer("big_input_imgs", self.wide_frame.prepare(wbuf, transform_wide.flatten(), self.model.getCLBuffer("big_input_imgs")))
if prepare_only:
return None
@@ -107,11 +105,10 @@ class ModelState:
self.model.execute()
outputs = self.parser.parse_outputs(self.slice_outputs(self.output))
self.full_features_20Hz[:-1] = self.full_features_20Hz[1:]
self.full_features_20Hz[-1] = outputs['hidden_state'][0, :]
idxs = np.arange(-4,-100,-4)[::-1]
self.inputs['features_buffer'][:] = self.full_features_20Hz[idxs].flatten()
self.inputs['features_buffer'][:-ModelConstants.FEATURE_LEN] = self.inputs['features_buffer'][ModelConstants.FEATURE_LEN:]
self.inputs['features_buffer'][-ModelConstants.FEATURE_LEN:] = outputs['hidden_state'][0, :]
self.inputs['prev_desired_curv'][:-ModelConstants.PREV_DESIRED_CURV_LEN] = self.inputs['prev_desired_curv'][ModelConstants.PREV_DESIRED_CURV_LEN:]
self.inputs['prev_desired_curv'][-ModelConstants.PREV_DESIRED_CURV_LEN:] = outputs['desired_curvature'][0, :]
return outputs
@@ -174,9 +171,10 @@ def main(demo=False):
if demo:
CP = get_demo_car_params()
CP = convert_to_capnp(get_demo_car_params())
else:
CP = messaging.log_from_bytes(params.get("CarParams", block=True), car.CarParams)
cloudlog.info("modeld got CarParams: %s", CP.carName)
# TODO this needs more thought, use .2s extra for now to estimate other delays
@@ -221,7 +219,7 @@ def main(demo=False):
desire = DH.desire
is_rhd = sm["driverMonitoringState"].isRHD
frame_id = sm["roadCameraState"].frameId
v_ego = max(sm["carState"].vEgo, 0.)
lateral_control_params = np.array([sm["carState"].vEgo, steer_delay], dtype=np.float32)
if sm.updated["liveCalibration"] and sm.seen['roadCameraState'] and sm.seen['deviceState']:
device_from_calib_euler = np.array(sm["liveCalibration"].rpyCalib, dtype=np.float32)
dc = DEVICE_CAMERAS[(str(sm['deviceState'].deviceType), str(sm['roadCameraState'].sensor))]
@@ -252,6 +250,7 @@ def main(demo=False):
inputs:dict[str, np.ndarray] = {
'desire': vec_desire,
'traffic_convention': traffic_convention,
'lateral_control_params': lateral_control_params,
}
mt1 = time.perf_counter()
@@ -263,8 +262,7 @@ def main(demo=False):
modelv2_send = messaging.new_message('modelV2')
drivingdata_send = messaging.new_message('drivingModelData')
posenet_send = messaging.new_message('cameraOdometry')
fill_model_msg(drivingdata_send, modelv2_send, model_output, v_ego, steer_delay,
publish_state, meta_main.frame_id, meta_extra.frame_id, frame_id,
fill_model_msg(drivingdata_send, modelv2_send, model_output, publish_state, meta_main.frame_id, meta_extra.frame_id, frame_id,
frame_drop_ratio, meta_main.timestamp_eof, model_execution_time, live_calib_seen)
desire_state = modelv2_send.modelV2.meta.desireState

View File

@@ -1,62 +0,0 @@
## Neural networks in openpilot
To view the architecture of the ONNX networks, you can use [netron](https://netron.app/)
## Supercombo
### Supercombo input format (Full size: 799906 x float32)
* **image stream**
* Two consecutive images (256 * 512 * 3 in RGB) recorded at 20 Hz : 393216 = 2 * 6 * 128 * 256
* Each 256 * 512 image is represented in YUV420 with 6 channels : 6 * 128 * 256
* Channels 0,1,2,3 represent the full-res Y channel and are represented in numpy as Y[::2, ::2], Y[::2, 1::2], Y[1::2, ::2], and Y[1::2, 1::2]
* Channel 4 represents the half-res U channel
* Channel 5 represents the half-res V channel
* **wide image stream**
* Two consecutive images (256 * 512 * 3 in RGB) recorded at 20 Hz : 393216 = 2 * 6 * 128 * 256
* Each 256 * 512 image is represented in YUV420 with 6 channels : 6 * 128 * 256
* Channels 0,1,2,3 represent the full-res Y channel and are represented in numpy as Y[::2, ::2], Y[::2, 1::2], Y[1::2, ::2], and Y[1::2, 1::2]
* Channel 4 represents the half-res U channel
* Channel 5 represents the half-res V channel
* **desire**
* one-hot encoded buffer to command model to execute certain actions, bit needs to be sent for the past 5 seconds (at 20FPS) : 100 * 8
* **traffic convention**
* one-hot encoded vector to tell model whether traffic is right-hand or left-hand traffic : 2
* **feature buffer**
* A buffer of intermediate features that gets appended to the current feature to form a 5 seconds temporal context (at 20FPS) : 99 * 512
### Supercombo output format (Full size: XXX x float32)
Read [here](https://github.com/commaai/openpilot/blob/90af436a121164a51da9fa48d093c29f738adf6a/selfdrive/modeld/models/driving.h#L236) for more.
## Driver Monitoring Model
* .onnx model can be run with onnx runtimes
* .dlc file is a pre-quantized model and only runs on qualcomm DSPs
### input format
* single image W = 1440 H = 960 luminance channel (Y) from the planar YUV420 format:
* full input size is 1440 * 960 = 1382400
* normalized ranging from 0.0 to 1.0 in float32 (onnx runner) or ranging from 0 to 255 in uint8 (snpe runner)
* camera calibration angles (roll, pitch, yaw) from liveCalibration: 3 x float32 inputs
### output format
* 84 x float32 outputs = 2 + 41 * 2 ([parsing example](https://github.com/commaai/openpilot/blob/22ce4e17ba0d3bfcf37f8255a4dd1dc683fe0c38/selfdrive/modeld/models/dmonitoring.cc#L33))
* for each person in the front seats (2 * 41)
* face pose: 12 = 6 + 6
* face orientation [pitch, yaw, roll] in camera frame: 3
* face position [dx, dy] relative to image center: 2
* normalized face size: 1
* standard deviations for above outputs: 6
* face visible probability: 1
* eyes: 20 = (8 + 1) + (8 + 1) + 1 + 1
* eye position and size, and their standard deviations: 8
* eye visible probability: 1
* eye closed probability: 1
* wearing sunglasses probability: 1
* face occluded probability: 1
* touching wheel probability: 1
* paying attention probability: 1
* (deprecated) distracted probabilities: 2
* using phone probability: 1
* distracted probability: 1
* common outputs 2
* poor camera vision probability: 1
* left hand drive probability: 1

View File

View File

@@ -1,69 +1,50 @@
#include "sunnypilot/modeld/models/commonmodel.h"
#include <cassert>
#include <cmath>
#include <cstring>
#include "common/clutil.h"
DrivingModelFrame::DrivingModelFrame(cl_device_id device_id, cl_context context) : ModelFrame(device_id, context) {
input_frames = std::make_unique<uint8_t[]>(buf_size);
//input_frames_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err));
img_buffer_20hz_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, 5*frame_size_bytes, NULL, &err));
region.origin = 4 * frame_size_bytes;
region.size = frame_size_bytes;
last_img_cl = CL_CHECK_ERR(clCreateSubBuffer(img_buffer_20hz_cl, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err));
ModelFrame::ModelFrame(cl_device_id device_id, cl_context context) {
input_frames = std::make_unique<float[]>(buf_size);
q = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, 0, &err));
y_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, MODEL_WIDTH * MODEL_HEIGHT, NULL, &err));
u_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (MODEL_WIDTH / 2) * (MODEL_HEIGHT / 2), NULL, &err));
v_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (MODEL_WIDTH / 2) * (MODEL_HEIGHT / 2), NULL, &err));
net_input_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, MODEL_FRAME_SIZE * sizeof(float), NULL, &err));
transform_init(&transform, context, device_id);
loadyuv_init(&loadyuv, context, device_id, MODEL_WIDTH, MODEL_HEIGHT);
init_transform(device_id, context, MODEL_WIDTH, MODEL_HEIGHT);
}
uint8_t* DrivingModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection, cl_mem* output) {
run_transform(yuv_cl, MODEL_WIDTH, MODEL_HEIGHT, frame_width, frame_height, frame_stride, frame_uv_offset, projection);
for (int i = 0; i < 4; i++) {
CL_CHECK(clEnqueueCopyBuffer(q, img_buffer_20hz_cl, img_buffer_20hz_cl, (i+1)*frame_size_bytes, i*frame_size_bytes, frame_size_bytes, 0, nullptr, nullptr));
}
loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, last_img_cl);
float* ModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3 &projection, cl_mem *output) {
transform_queue(&this->transform, q,
yuv_cl, frame_width, frame_height, frame_stride, frame_uv_offset,
y_cl, u_cl, v_cl, MODEL_WIDTH, MODEL_HEIGHT, projection);
if (output == NULL) {
CL_CHECK(clEnqueueReadBuffer(q, img_buffer_20hz_cl, CL_TRUE, 0, frame_size_bytes, &input_frames[0], 0, nullptr, nullptr));
CL_CHECK(clEnqueueReadBuffer(q, last_img_cl, CL_TRUE, 0, frame_size_bytes, &input_frames[MODEL_FRAME_SIZE], 0, nullptr, nullptr));
loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, net_input_cl);
std::memmove(&input_frames[0], &input_frames[MODEL_FRAME_SIZE], sizeof(float) * MODEL_FRAME_SIZE);
CL_CHECK(clEnqueueReadBuffer(q, net_input_cl, CL_TRUE, 0, MODEL_FRAME_SIZE * sizeof(float), &input_frames[MODEL_FRAME_SIZE], 0, nullptr, nullptr));
clFinish(q);
return &input_frames[0];
} else {
copy_queue(&loadyuv, q, img_buffer_20hz_cl, *output, 0, 0, frame_size_bytes);
copy_queue(&loadyuv, q, last_img_cl, *output, 0, frame_size_bytes, frame_size_bytes);
loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, *output, true);
// NOTE: Since thneed is using a different command queue, this clFinish is needed to ensure the image is ready.
clFinish(q);
return NULL;
}
}
DrivingModelFrame::~DrivingModelFrame() {
deinit_transform();
ModelFrame::~ModelFrame() {
transform_destroy(&transform);
loadyuv_destroy(&loadyuv);
CL_CHECK(clReleaseMemObject(img_buffer_20hz_cl));
CL_CHECK(clReleaseMemObject(last_img_cl));
CL_CHECK(clReleaseMemObject(net_input_cl));
CL_CHECK(clReleaseMemObject(v_cl));
CL_CHECK(clReleaseMemObject(u_cl));
CL_CHECK(clReleaseMemObject(y_cl));
CL_CHECK(clReleaseCommandQueue(q));
}
MonitoringModelFrame::MonitoringModelFrame(cl_device_id device_id, cl_context context) : ModelFrame(device_id, context) {
input_frames = std::make_unique<uint8_t[]>(buf_size);
//input_frame_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err));
init_transform(device_id, context, MODEL_WIDTH, MODEL_HEIGHT);
}
uint8_t* MonitoringModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection, cl_mem* output) {
run_transform(yuv_cl, MODEL_WIDTH, MODEL_HEIGHT, frame_width, frame_height, frame_stride, frame_uv_offset, projection);
CL_CHECK(clEnqueueReadBuffer(q, y_cl, CL_TRUE, 0, MODEL_FRAME_SIZE * sizeof(uint8_t), input_frames.get(), 0, nullptr, nullptr));
clFinish(q);
//return &y_cl;
return input_frames.get();
}
MonitoringModelFrame::~MonitoringModelFrame() {
deinit_transform();
CL_CHECK(clReleaseCommandQueue(q));
}
}

View File

@@ -2,7 +2,6 @@
#include <cfloat>
#include <cstdlib>
#include <cassert>
#include <memory>
@@ -14,85 +13,24 @@
#endif
#include "common/mat.h"
#include "selfdrive/modeld/transforms/loadyuv.h"
#include "selfdrive/modeld/transforms/transform.h"
#include "sunnypilot/modeld/transforms/loadyuv.h"
#include "sunnypilot/modeld/transforms/transform.h"
class ModelFrame {
public:
ModelFrame(cl_device_id device_id, cl_context context) {
q = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, 0, &err));
}
virtual ~ModelFrame() {}
virtual uint8_t* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection, cl_mem* output) { return NULL; }
/*
uint8_t* buffer_from_cl(cl_mem *in_frames, int buffer_size) {
CL_CHECK(clEnqueueReadBuffer(q, *in_frames, CL_TRUE, 0, buffer_size, input_frames.get(), 0, nullptr, nullptr));
clFinish(q);
return &input_frames[0];
}
*/
int MODEL_WIDTH;
int MODEL_HEIGHT;
int MODEL_FRAME_SIZE;
int buf_size;
protected:
cl_mem y_cl, u_cl, v_cl;
Transform transform;
cl_command_queue q;
std::unique_ptr<uint8_t[]> input_frames;
void init_transform(cl_device_id device_id, cl_context context, int model_width, int model_height) {
y_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, model_width * model_height, NULL, &err));
u_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (model_width / 2) * (model_height / 2), NULL, &err));
v_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (model_width / 2) * (model_height / 2), NULL, &err));
transform_init(&transform, context, device_id);
}
void deinit_transform() {
transform_destroy(&transform);
CL_CHECK(clReleaseMemObject(v_cl));
CL_CHECK(clReleaseMemObject(u_cl));
CL_CHECK(clReleaseMemObject(y_cl));
}
void run_transform(cl_mem yuv_cl, int model_width, int model_height, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection) {
transform_queue(&transform, q,
yuv_cl, frame_width, frame_height, frame_stride, frame_uv_offset,
y_cl, u_cl, v_cl, model_width, model_height, projection);
}
};
class DrivingModelFrame : public ModelFrame {
public:
DrivingModelFrame(cl_device_id device_id, cl_context context);
~DrivingModelFrame();
uint8_t* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection, cl_mem* output);
ModelFrame(cl_device_id device_id, cl_context context);
~ModelFrame();
float* prepare(cl_mem yuv_cl, int width, int height, int frame_stride, int frame_uv_offset, const mat3& transform, cl_mem *output);
const int MODEL_WIDTH = 512;
const int MODEL_HEIGHT = 256;
const int MODEL_FRAME_SIZE = MODEL_WIDTH * MODEL_HEIGHT * 3 / 2;
const int buf_size = MODEL_FRAME_SIZE * 2;
const size_t frame_size_bytes = MODEL_FRAME_SIZE * sizeof(uint8_t);
private:
Transform transform;
LoadYUVState loadyuv;
cl_mem img_buffer_20hz_cl, last_img_cl;//, input_frames_cl;
cl_buffer_region region;
};
class MonitoringModelFrame : public ModelFrame {
public:
MonitoringModelFrame(cl_device_id device_id, cl_context context);
~MonitoringModelFrame();
uint8_t* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection, cl_mem* output);
const int MODEL_WIDTH = 1440;
const int MODEL_HEIGHT = 960;
const int MODEL_FRAME_SIZE = MODEL_WIDTH * MODEL_HEIGHT;
const int buf_size = MODEL_FRAME_SIZE;
private:
// cl_mem input_frame_cl;
cl_command_queue q;
cl_mem y_cl, u_cl, v_cl, net_input_cl;
std::unique_ptr<float[]> input_frames;
};

View File

@@ -14,13 +14,5 @@ cdef extern from "common/clutil.h":
cdef extern from "sunnypilot/modeld/models/commonmodel.h":
cppclass ModelFrame:
int buf_size
# unsigned char * buffer_from_cl(cl_mem*, int);
unsigned char * prepare(cl_mem, int, int, int, int, mat3, cl_mem*)
cppclass DrivingModelFrame:
int buf_size
DrivingModelFrame(cl_device_id, cl_context)
cppclass MonitoringModelFrame:
int buf_size
MonitoringModelFrame(cl_device_id, cl_context)
ModelFrame(cl_device_id, cl_context)
float * prepare(cl_mem, int, int, int, int, mat3, cl_mem*)

View File

@@ -4,12 +4,11 @@
import numpy as np
cimport numpy as cnp
from libc.string cimport memcpy
from libc.stdint cimport uintptr_t
from msgq.visionipc.visionipc cimport cl_mem
from msgq.visionipc.visionipc_pyx cimport VisionBuf, CLContext as BaseCLContext
from sunnypilot.modeld.models.commonmodel cimport CL_DEVICE_TYPE_DEFAULT, cl_get_device_id, cl_create_context
from sunnypilot.modeld.models.commonmodel cimport mat3, ModelFrame as cppModelFrame, DrivingModelFrame as cppDrivingModelFrame, MonitoringModelFrame as cppMonitoringModelFrame
from .commonmodel cimport CL_DEVICE_TYPE_DEFAULT, cl_get_device_id, cl_create_context
from .commonmodel cimport mat3, ModelFrame as cppModelFrame
cdef class CLContext(BaseCLContext):
@@ -24,17 +23,11 @@ cdef class CLMem:
mem.mem = <cl_mem*> cmem
return mem
@property
def mem_address(self):
return <uintptr_t>(self.mem)
def cl_from_visionbuf(VisionBuf buf):
return CLMem.create(<void*>&buf.buf.buf_cl)
cdef class ModelFrame:
cdef cppModelFrame * frame
cdef int buf_size
def __cinit__(self, CLContext context):
self.frame = new cppModelFrame(context.device_id, context.context)
def __dealloc__(self):
del self.frame
@@ -42,35 +35,11 @@ cdef class ModelFrame:
def prepare(self, VisionBuf buf, float[:] projection, CLMem output):
cdef mat3 cprojection
memcpy(cprojection.v, &projection[0], 9*sizeof(float))
cdef unsigned char * data
cdef float * data
if output is None:
data = self.frame.prepare(buf.buf.buf_cl, buf.width, buf.height, buf.stride, buf.uv_offset, cprojection, NULL)
else:
data = self.frame.prepare(buf.buf.buf_cl, buf.width, buf.height, buf.stride, buf.uv_offset, cprojection, output.mem)
if not data:
return None
return np.asarray(<cnp.uint8_t[:self.buf_size]> data)
# return CLMem.create(data)
# def buffer_from_cl(self, CLMem in_frames):
# cdef unsigned char * data2
# data2 = self.frame.buffer_from_cl(in_frames.mem, self.buf_size)
# return np.asarray(<cnp.uint8_t[:self.buf_size]> data2)
cdef class DrivingModelFrame(ModelFrame):
cdef cppDrivingModelFrame * _frame
def __cinit__(self, CLContext context):
self._frame = new cppDrivingModelFrame(context.device_id, context.context)
self.frame = <cppModelFrame*>(self._frame)
self.buf_size = self._frame.buf_size
cdef class MonitoringModelFrame(ModelFrame):
cdef cppMonitoringModelFrame * _frame
def __cinit__(self, CLContext context):
self._frame = new cppMonitoringModelFrame(context.device_id, context.context)
self.frame = <cppModelFrame*>(self._frame)
self.buf_size = self._frame.buf_size
return np.asarray(<cnp.float32_t[:self.frame.buf_size]> data)

Binary file not shown.

Binary file not shown.

View File

@@ -96,6 +96,8 @@ class Parser:
out_shape=(ModelConstants.LEAD_TRAJ_LEN,ModelConstants.LEAD_WIDTH))
if 'lat_planner_solution' in outs:
self.parse_mdn('lat_planner_solution', outs, in_N=0, out_N=0, out_shape=(ModelConstants.IDX_N,ModelConstants.LAT_PLANNER_SOLUTION_WIDTH))
if 'desired_curvature' in outs:
self.parse_mdn('desired_curvature', outs, in_N=0, out_N=0, out_shape=(ModelConstants.DESIRED_CURV_WIDTH,))
for k in ['lead_prob', 'lane_lines_prob', 'meta']:
self.parse_binary_crossentropy(k, outs)
self.parse_categorical_crossentropy('desire_state', outs, out_shape=(ModelConstants.DESIRE_PRED_WIDTH,))

View File

@@ -1,12 +1,39 @@
import os
import onnx
import itertools
import os
import sys
import numpy as np
from typing import Any
from openpilot.sunnypilot.modeld.runners.runmodel_pyx import RunModel
from openpilot.sunnypilot.modeld.runners.ort_helpers import convert_fp16_to_fp32, ORT_TYPES_TO_NP_TYPES
ORT_TYPES_TO_NP_TYPES = {'tensor(float16)': np.float16, 'tensor(float)': np.float32, 'tensor(uint8)': np.uint8}
def attributeproto_fp16_to_fp32(attr):
float32_list = np.frombuffer(attr.raw_data, dtype=np.float16)
attr.data_type = 1
attr.raw_data = float32_list.astype(np.float32).tobytes()
def convert_fp16_to_fp32(onnx_path_or_bytes):
if isinstance(onnx_path_or_bytes, bytes):
model = onnx.load_from_string(onnx_path_or_bytes)
elif isinstance(onnx_path_or_bytes, str):
model = onnx.load(onnx_path_or_bytes)
for i in model.graph.initializer:
if i.data_type == 10:
attributeproto_fp16_to_fp32(i)
for i in itertools.chain(model.graph.input, model.graph.output):
if i.type.tensor_type.elem_type == 10:
i.type.tensor_type.elem_type = 1
for i in model.graph.node:
if i.op_type == 'Cast' and i.attribute[0].i == 10:
i.attribute[0].i = 1
for a in i.attribute:
if hasattr(a, 't'):
if a.t.data_type == 10:
attributeproto_fp16_to_fp32(a.t)
return model.SerializeToString()
def create_ort_session(path, fp16_to_fp32):
os.environ["OMP_NUM_THREADS"] = "4"
@@ -22,14 +49,14 @@ def create_ort_session(path, fp16_to_fp32):
provider = 'OpenVINOExecutionProvider'
elif 'CUDAExecutionProvider' in ort.get_available_providers() and 'ONNXCPU' not in os.environ:
options.intra_op_num_threads = 2
provider = ('CUDAExecutionProvider', {'cudnn_conv_algo_search': 'EXHAUSTIVE'})
provider = ('CUDAExecutionProvider', {'cudnn_conv_algo_search': 'DEFAULT'})
else:
options.intra_op_num_threads = 2
options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
provider = 'CPUExecutionProvider'
model_data = convert_fp16_to_fp32(onnx.load(path)) if fp16_to_fp32 else path
model_data = convert_fp16_to_fp32(path) if fp16_to_fp32 else path
print("Onnx selected provider: ", [provider], file=sys.stderr)
ort_session = ort.InferenceSession(model_data, options, providers=[provider])
print("Onnx using ", ort_session.get_providers(), file=sys.stderr)

View File

@@ -1,36 +0,0 @@
import onnx
import onnxruntime as ort
import numpy as np
import itertools
ORT_TYPES_TO_NP_TYPES = {'tensor(float16)': np.float16, 'tensor(float)': np.float32, 'tensor(uint8)': np.uint8}
def attributeproto_fp16_to_fp32(attr):
float32_list = np.frombuffer(attr.raw_data, dtype=np.float16)
attr.data_type = 1
attr.raw_data = float32_list.astype(np.float32).tobytes()
def convert_fp16_to_fp32(model):
for i in model.graph.initializer:
if i.data_type == 10:
attributeproto_fp16_to_fp32(i)
for i in itertools.chain(model.graph.input, model.graph.output):
if i.type.tensor_type.elem_type == 10:
i.type.tensor_type.elem_type = 1
for i in model.graph.node:
if i.op_type == 'Cast' and i.attribute[0].i == 10:
i.attribute[0].i = 1
for a in i.attribute:
if hasattr(a, 't'):
if a.t.data_type == 10:
attributeproto_fp16_to_fp32(a.t)
return model.SerializeToString()
def make_onnx_cpu_runner(model_path):
options = ort.SessionOptions()
options.intra_op_num_threads = 4
options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
model_data = convert_fp16_to_fp32(onnx.load(model_path))
return ort.InferenceSession(model_data, options, providers=['CPUExecutionProvider'])

View File

@@ -1,4 +1,4 @@
#include "selfdrive/modeld/transforms/loadyuv.h"
#include "sunnypilot/modeld/transforms/loadyuv.h"
#include <cassert>
#include <cstdio>
@@ -33,8 +33,17 @@ void loadyuv_destroy(LoadYUVState* s) {
void loadyuv_queue(LoadYUVState* s, cl_command_queue q,
cl_mem y_cl, cl_mem u_cl, cl_mem v_cl,
cl_mem out_cl) {
cl_mem out_cl, bool do_shift) {
cl_int global_out_off = 0;
if (do_shift) {
// shift the image in slot 1 to slot 0, then place the new image in slot 1
global_out_off += (s->width*s->height) + (s->width/2)*(s->height/2)*2;
CL_CHECK(clSetKernelArg(s->copy_krnl, 0, sizeof(cl_mem), &out_cl));
CL_CHECK(clSetKernelArg(s->copy_krnl, 1, sizeof(cl_int), &global_out_off));
const size_t copy_work_size = global_out_off/8;
CL_CHECK(clEnqueueNDRangeKernel(q, s->copy_krnl, 1, NULL,
&copy_work_size, NULL, 0, 0, NULL));
}
CL_CHECK(clSetKernelArg(s->loadys_krnl, 0, sizeof(cl_mem), &y_cl));
CL_CHECK(clSetKernelArg(s->loadys_krnl, 1, sizeof(cl_mem), &out_cl));
@@ -63,14 +72,3 @@ void loadyuv_queue(LoadYUVState* s, cl_command_queue q,
CL_CHECK(clEnqueueNDRangeKernel(q, s->loaduv_krnl, 1, NULL,
&loaduv_work_size, NULL, 0, 0, NULL));
}
void copy_queue(LoadYUVState* s, cl_command_queue q, cl_mem src, cl_mem dst,
size_t src_offset, size_t dst_offset, size_t size) {
CL_CHECK(clSetKernelArg(s->copy_krnl, 0, sizeof(cl_mem), &src));
CL_CHECK(clSetKernelArg(s->copy_krnl, 1, sizeof(cl_mem), &dst));
CL_CHECK(clSetKernelArg(s->copy_krnl, 2, sizeof(cl_int), &src_offset));
CL_CHECK(clSetKernelArg(s->copy_krnl, 3, sizeof(cl_int), &dst_offset));
const size_t copy_work_size = size/8;
CL_CHECK(clEnqueueNDRangeKernel(q, s->copy_krnl, 1, NULL,
&copy_work_size, NULL, 0, 0, NULL));
}

View File

@@ -1,7 +1,7 @@
#define UV_SIZE ((TRANSFORMED_WIDTH/2)*(TRANSFORMED_HEIGHT/2))
__kernel void loadys(__global uchar8 const * const Y,
__global uchar * out,
__global float * out,
int out_offset)
{
const int gid = get_global_id(0);
@@ -10,12 +10,13 @@ __kernel void loadys(__global uchar8 const * const Y,
const int ox = ois % TRANSFORMED_WIDTH;
const uchar8 ys = Y[gid];
const float8 ysf = convert_float8(ys);
// 02
// 13
__global uchar* outy0;
__global uchar* outy1;
__global float* outy0;
__global float* outy1;
if ((oy & 1) == 0) {
outy0 = out + out_offset; //y0
outy1 = out + out_offset + UV_SIZE*2; //y2
@@ -24,24 +25,23 @@ __kernel void loadys(__global uchar8 const * const Y,
outy1 = out + out_offset + UV_SIZE*3; //y3
}
vstore4(ys.s0246, 0, outy0 + (oy/2) * (TRANSFORMED_WIDTH/2) + ox/2);
vstore4(ys.s1357, 0, outy1 + (oy/2) * (TRANSFORMED_WIDTH/2) + ox/2);
vstore4(ysf.s0246, 0, outy0 + (oy/2) * (TRANSFORMED_WIDTH/2) + ox/2);
vstore4(ysf.s1357, 0, outy1 + (oy/2) * (TRANSFORMED_WIDTH/2) + ox/2);
}
__kernel void loaduv(__global uchar8 const * const in,
__global uchar8 * out,
__global float8 * out,
int out_offset)
{
const int gid = get_global_id(0);
const uchar8 inv = in[gid];
out[gid + out_offset / 8] = inv;
const float8 outv = convert_float8(inv);
out[gid + out_offset / 8] = outv;
}
__kernel void copy(__global uchar8 * in,
__global uchar8 * out,
int in_offset,
int out_offset)
__kernel void copy(__global float8 * inout,
int in_offset)
{
const int gid = get_global_id(0);
out[gid + out_offset / 8] = in[gid + in_offset / 8];
inout[gid] = inout[gid + in_offset / 8];
}

View File

@@ -13,8 +13,4 @@ void loadyuv_destroy(LoadYUVState* s);
void loadyuv_queue(LoadYUVState* s, cl_command_queue q,
cl_mem y_cl, cl_mem u_cl, cl_mem v_cl,
cl_mem out_cl);
void copy_queue(LoadYUVState* s, cl_command_queue q, cl_mem src, cl_mem dst,
size_t src_offset, size_t dst_offset, size_t size);
cl_mem out_cl, bool do_shift = false);

View File

@@ -1,4 +1,4 @@
#include "selfdrive/modeld/transforms/transform.h"
#include "sunnypilot/modeld/transforms/transform.h"
#include <cassert>
#include <cstring>