models: retain SNPE runner support (#522)

* tinygrad with snpe

* force with snpe to validate

* fix path

* fix more paths

* Adjust modeld execution logic based on active model runner

Introduced a check to conditionally execute `modeld` based on the active model runner. Added support for distinguishing between SNPE and TinyGrad runners using new helper functions and updated `custom.capnp` definitions. This change optimizes process management by ensuring compatibility with the selected model runner.

* Refactor modeld process function checks.

Introduce `is_stock_model` to clarify logic and replace direct uses of `is_snpe_model` where the stock model condition is needed. Additionally, rename the duplicate "modeld" process in sunnyPilot to "modeld_snpe" for clarity and consistency.

* ignore tg

* fix process name

* ruff

* fix thneed paths

* mypy

* remove our own

* use upstream compile3

* fix thneed

* try this

* Revert "remove our own"

This reverts commit 1cf4f57502.

* try using compile2.py again

* add back symlink

* fix path

* more fix

* wrong path again

* Revert "wrong path again"

This reverts commit f5301c19d5.

* update

* hardcode path to our submodule

* force path

* try this

* fix file name

* try this

* again

* Revert "again"

This reverts commit 17c8cd7376.

* Revert "try this"

This reverts commit 767f78bbcf.

* Revert "fix file name"

This reverts commit 485eef68da.

* Revert "try this"

This reverts commit 41fef87680.

* Revert "force path"

This reverts commit 5c3b408937.

* Revert "hardcode path to our submodule"

This reverts commit 5ee1950b6f.

* Revert "update"

This reverts commit fb313bd7fb.

* Reapply "wrong path again"

This reverts commit 309639aeb3.

* Revert "wrong path again"

This reverts commit f5301c19d5.

* Revert "more fix"

This reverts commit 23dd423e78.

* Revert "fix path"

This reverts commit 75d338f2bd.

* Revert "add back symlink"

This reverts commit 9f71ad0b8a.

* Revert "try using compile2.py again"

This reverts commit 914117d2e1.

* Reapply "remove our own"

This reverts commit b1996377b3.

* don't even compile anymore

* need it for default snpe model

* add to lfs

* bring onnx back for sim

* must add this back

* need this

---------

Co-authored-by: DevTekVE <devtekve@gmail.com>
This commit is contained in:
Jason Wen
2025-01-05 17:47:27 -05:00
committed by GitHub
parent c12c22eac7
commit f84d27c4ee
56 changed files with 2879 additions and 4 deletions

2
.gitattributes vendored
View File

@@ -3,6 +3,8 @@
# to move existing files into LFS:
# git add --renormalize .
*.onnx filter=lfs diff=lfs merge=lfs -text
*.thneed filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.svg filter=lfs diff=lfs merge=lfs -text
*.png filter=lfs diff=lfs merge=lfs -text
*.gif filter=lfs diff=lfs merge=lfs -text

1
.gitignore vendored
View File

@@ -74,6 +74,7 @@ comma*.sh
selfdrive/modeld/thneed/compile
selfdrive/modeld/models/*.thneed
selfdrive/modeld/models/*.pkl
sunnypilot/modeld/thneed/compile
*.bz2
*.zst

2
.gitmodules vendored
View File

@@ -15,4 +15,4 @@
url = https://github.com/commaai/teleoprtc
[submodule "tinygrad"]
path = tinygrad_repo
url = https://github.com/tinygrad/tinygrad.git
url = https://github.com/commaai/tinygrad.git

View File

@@ -396,6 +396,8 @@ SConscript(['third_party/SConscript'])
SConscript(['selfdrive/SConscript'])
SConscript(['sunnypilot/SConscript'])
if Dir('#tools/cabana/').exists() and GetOption('extras'):
SConscript(['tools/replay/SConscript'])
if arch != "larch64":

View File

@@ -64,6 +64,11 @@ struct ModelManagerSP @0xaedffd8f31e7b55d {
progress @1 :Float32;
eta @2 :UInt32;
}
enum Runner {
snpe @0;
tinygrad @1;
}
struct ModelBundle {
index @0 :UInt32;

View File

@@ -137,7 +137,7 @@ allow-direct-references = true
[tool.pytest.ini_options]
minversion = "6.0"
addopts = "--ignore=openpilot/ --ignore=opendbc/ --ignore=panda/ --ignore=rednose_repo/ --ignore=tinygrad_repo/ --ignore=teleoprtc_repo/ --ignore=msgq/ -Werror --strict-config --strict-markers --durations=10 -n auto --dist=loadgroup"
addopts = "--ignore=openpilot/ --ignore=opendbc/ --ignore=panda/ --ignore=rednose_repo/ --ignore=tinygrad_repo/ --ignore=teleoprtc_repo/ --ignore=msgq/ --ignore=sunnypilot/tinygrad_repo/ -Werror --strict-config --strict-markers --durations=10 -n auto --dist=loadgroup"
cpp_files = "test_*"
cpp_harness = "selfdrive/test/cpp_harness.py"
python_files = "test_*.py"

1
sunnypilot/SConscript Normal file
View File

@@ -0,0 +1 @@
SConscript(['modeld/SConscript'])

1
sunnypilot/modeld/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
*_pyx.cpp

View File

@@ -0,0 +1,58 @@
import glob
Import('env', 'envCython', 'arch', 'cereal', 'messaging', 'common', 'gpucommon', 'visionipc', 'transformations')
lenv = env.Clone()
lenvCython = envCython.Clone()
libs = [cereal, messaging, visionipc, gpucommon, common, 'capnp', 'kj', 'pthread']
frameworks = []
common_src = [
"models/commonmodel.cc",
"transforms/loadyuv.cc",
"transforms/transform.cc",
]
thneed_src_common = [
"thneed/thneed_common.cc",
"thneed/serialize.cc",
]
thneed_src_qcom = thneed_src_common + ["thneed/thneed_qcom2.cc"]
thneed_src_pc = thneed_src_common + ["thneed/thneed_pc.cc"]
thneed_src = thneed_src_qcom if arch == "larch64" else thneed_src_pc
# SNPE except on Mac and ARM Linux
snpe_lib = []
if arch != "Darwin" and arch != "aarch64":
common_src += ['runners/snpemodel.cc']
snpe_lib += ['SNPE']
# OpenCL is a framework on Mac
if arch == "Darwin":
frameworks += ['OpenCL']
else:
libs += ['OpenCL']
# Set path definitions
for pathdef, fn in {'TRANSFORM': 'transforms/transform.cl', 'LOADYUV': 'transforms/loadyuv.cl'}.items():
for xenv in (lenv, lenvCython):
xenv['CXXFLAGS'].append(f'-D{pathdef}_PATH=\\"{File(fn).abspath}\\"')
# Compile cython
snpe_rpath_qcom = "/data/pythonpath/third_party/snpe/larch64"
snpe_rpath_pc = f"{Dir('#').abspath}/third_party/snpe/x86_64-linux-clang"
snpe_rpath = lenvCython['RPATH'] + [snpe_rpath_qcom if arch == "larch64" else snpe_rpath_pc]
cython_libs = envCython["LIBS"] + libs
snpemodel_lib = lenv.Library('snpemodel', ['runners/snpemodel.cc'])
commonmodel_lib = lenv.Library('commonmodel', common_src)
lenvCython.Program('runners/runmodel_pyx.so', 'runners/runmodel_pyx.pyx', LIBS=cython_libs, FRAMEWORKS=frameworks)
lenvCython.Program('runners/snpemodel_pyx.so', 'runners/snpemodel_pyx.pyx', LIBS=[snpemodel_lib, snpe_lib, *cython_libs], FRAMEWORKS=frameworks, RPATH=snpe_rpath)
lenvCython.Program('models/commonmodel_pyx.so', 'models/commonmodel_pyx.pyx', LIBS=[commonmodel_lib, *cython_libs], FRAMEWORKS=frameworks)
if arch == 'larch64' or GetOption('pc_thneed'):
thneed_lib = env.SharedLibrary('thneed', thneed_src, LIBS=[gpucommon, common, 'OpenCL', 'dl'])
thneedmodel_lib = env.Library('thneedmodel', ['runners/thneedmodel.cc'])
lenvCython.Program('runners/thneedmodel_pyx.so', 'runners/thneedmodel_pyx.pyx', LIBS=envCython["LIBS"]+[thneedmodel_lib, thneed_lib, gpucommon, common, 'dl', 'OpenCL'])

View File

View File

@@ -0,0 +1,86 @@
import numpy as np
def index_function(idx, max_val=192, max_idx=32):
return (max_val) * ((idx/max_idx)**2)
class ModelConstants:
# time and distance indices
IDX_N = 33
T_IDXS = [index_function(idx, max_val=10.0) for idx in range(IDX_N)]
X_IDXS = [index_function(idx, max_val=192.0) for idx in range(IDX_N)]
LEAD_T_IDXS = [0., 2., 4., 6., 8., 10.]
LEAD_T_OFFSETS = [0., 2., 4.]
META_T_IDXS = [2., 4., 6., 8., 10.]
# model inputs constants
MODEL_FREQ = 20
FEATURE_LEN = 512
FULL_HISTORY_BUFFER_LEN = 99
HISTORY_BUFFER_LEN = 24
DESIRE_LEN = 8
TRAFFIC_CONVENTION_LEN = 2
LAT_PLANNER_STATE_LEN = 4
LATERAL_CONTROL_PARAMS_LEN = 2
PREV_DESIRED_CURV_LEN = 1
# model outputs constants
FCW_THRESHOLDS_5MS2 = np.array([.05, .05, .15, .15, .15], dtype=np.float32)
FCW_THRESHOLDS_3MS2 = np.array([.7, .7], dtype=np.float32)
FCW_5MS2_PROBS_WIDTH = 5
FCW_3MS2_PROBS_WIDTH = 2
DISENGAGE_WIDTH = 5
POSE_WIDTH = 6
WIDE_FROM_DEVICE_WIDTH = 3
SIM_POSE_WIDTH = 6
LEAD_WIDTH = 4
LANE_LINES_WIDTH = 2
ROAD_EDGES_WIDTH = 2
PLAN_WIDTH = 15
DESIRE_PRED_WIDTH = 8
LAT_PLANNER_SOLUTION_WIDTH = 4
DESIRED_CURV_WIDTH = 1
NUM_LANE_LINES = 4
NUM_ROAD_EDGES = 2
LEAD_TRAJ_LEN = 6
DESIRE_PRED_LEN = 4
PLAN_MHP_N = 5
LEAD_MHP_N = 2
PLAN_MHP_SELECTION = 1
LEAD_MHP_SELECTION = 3
FCW_THRESHOLD_5MS2_HIGH = 0.15
FCW_THRESHOLD_5MS2_LOW = 0.05
FCW_THRESHOLD_3MS2 = 0.7
CONFIDENCE_BUFFER_LEN = 5
RYG_GREEN = 0.01165
RYG_YELLOW = 0.06157
POLY_PATH_DEGREE = 4
# model outputs slices
class Plan:
POSITION = slice(0, 3)
VELOCITY = slice(3, 6)
ACCELERATION = slice(6, 9)
T_FROM_CURRENT_EULER = slice(9, 12)
ORIENTATION_RATE = slice(12, 15)
class Meta:
ENGAGED = slice(0, 1)
# next 2, 4, 6, 8, 10 seconds
GAS_DISENGAGE = slice(1, 31, 6)
BRAKE_DISENGAGE = slice(2, 31, 6)
STEER_OVERRIDE = slice(3, 31, 6)
HARD_BRAKE_3 = slice(4, 31, 6)
HARD_BRAKE_4 = slice(5, 31, 6)
HARD_BRAKE_5 = slice(6, 31, 6)
# next 0, 2, 4, 6, 8, 10 seconds
GAS_PRESS = slice(31, 55, 4)
BRAKE_PRESS = slice(32, 55, 4)
LEFT_BLINKER = slice(33, 55, 4)
RIGHT_BLINKER = slice(34, 55, 4)

View File

@@ -0,0 +1,237 @@
import os
import capnp
import numpy as np
from cereal import log
from openpilot.sunnypilot.modeld.constants import ModelConstants, Plan, Meta
from openpilot.selfdrive.controls.lib.drive_helpers import MIN_SPEED
SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')
ConfidenceClass = log.ModelDataV2.ConfidenceClass
def curv_from_psis(psi_target, psi_rate, vego, delay):
vego = np.clip(vego, MIN_SPEED, np.inf)
curv_from_psi = psi_target / (vego * delay) # epsilon to prevent divide-by-zero
return 2*curv_from_psi - psi_rate / vego
def get_curvature_from_plan(plan, vego, delay):
psi_target = np.interp(delay, ModelConstants.T_IDXS, plan[:, Plan.T_FROM_CURRENT_EULER][:, 2])
psi_rate = plan[:, Plan.ORIENTATION_RATE][0, 2]
return curv_from_psis(psi_target, psi_rate, vego, delay)
class PublishState:
def __init__(self):
self.disengage_buffer = np.zeros(ModelConstants.CONFIDENCE_BUFFER_LEN*ModelConstants.DISENGAGE_WIDTH, dtype=np.float32)
self.prev_brake_5ms2_probs = np.zeros(ModelConstants.FCW_5MS2_PROBS_WIDTH, dtype=np.float32)
self.prev_brake_3ms2_probs = np.zeros(ModelConstants.FCW_3MS2_PROBS_WIDTH, dtype=np.float32)
def fill_xyzt(builder, t, x, y, z, x_std=None, y_std=None, z_std=None):
builder.t = t
builder.x = x.tolist()
builder.y = y.tolist()
builder.z = z.tolist()
if x_std is not None:
builder.xStd = x_std.tolist()
if y_std is not None:
builder.yStd = y_std.tolist()
if z_std is not None:
builder.zStd = z_std.tolist()
def fill_xyvat(builder, t, x, y, v, a, x_std=None, y_std=None, v_std=None, a_std=None):
builder.t = t
builder.x = x.tolist()
builder.y = y.tolist()
builder.v = v.tolist()
builder.a = a.tolist()
if x_std is not None:
builder.xStd = x_std.tolist()
if y_std is not None:
builder.yStd = y_std.tolist()
if v_std is not None:
builder.vStd = v_std.tolist()
if a_std is not None:
builder.aStd = a_std.tolist()
def fill_xyz_poly(builder, degree, x, y, z):
xyz = np.stack([x, y, z], axis=1)
coeffs = np.polynomial.polynomial.polyfit(ModelConstants.T_IDXS, xyz, deg=degree)
builder.xCoefficients = coeffs[:, 0].tolist()
builder.yCoefficients = coeffs[:, 1].tolist()
builder.zCoefficients = coeffs[:, 2].tolist()
def fill_lane_line_meta(builder, lane_lines, lane_line_probs):
builder.leftY = lane_lines[1].y[0]
builder.leftProb = lane_line_probs[1]
builder.rightY = lane_lines[2].y[0]
builder.rightProb = lane_line_probs[2]
def fill_model_msg(base_msg: capnp._DynamicStructBuilder, extended_msg: capnp._DynamicStructBuilder,
net_output_data: dict[str, np.ndarray], v_ego: float, delay: float,
publish_state: PublishState, vipc_frame_id: int, vipc_frame_id_extra: int,
frame_id: int, frame_drop: float, timestamp_eof: int, model_execution_time: float,
valid: bool) -> None:
frame_age = frame_id - vipc_frame_id if frame_id > vipc_frame_id else 0
frame_drop_perc = frame_drop * 100
extended_msg.valid = valid
base_msg.valid = valid
desired_curv = float(get_curvature_from_plan(net_output_data['plan'][0], v_ego, delay))
driving_model_data = base_msg.drivingModelData
driving_model_data.frameId = vipc_frame_id
driving_model_data.frameIdExtra = vipc_frame_id_extra
driving_model_data.frameDropPerc = frame_drop_perc
driving_model_data.modelExecutionTime = model_execution_time
action = driving_model_data.action
action.desiredCurvature = desired_curv
modelV2 = extended_msg.modelV2
modelV2.frameId = vipc_frame_id
modelV2.frameIdExtra = vipc_frame_id_extra
modelV2.frameAge = frame_age
modelV2.frameDropPerc = frame_drop_perc
modelV2.timestampEof = timestamp_eof
modelV2.modelExecutionTime = model_execution_time
# plan
position = modelV2.position
fill_xyzt(position, ModelConstants.T_IDXS, *net_output_data['plan'][0,:,Plan.POSITION].T, *net_output_data['plan_stds'][0,:,Plan.POSITION].T)
velocity = modelV2.velocity
fill_xyzt(velocity, ModelConstants.T_IDXS, *net_output_data['plan'][0,:,Plan.VELOCITY].T)
acceleration = modelV2.acceleration
fill_xyzt(acceleration, ModelConstants.T_IDXS, *net_output_data['plan'][0,:,Plan.ACCELERATION].T)
orientation = modelV2.orientation
fill_xyzt(orientation, ModelConstants.T_IDXS, *net_output_data['plan'][0,:,Plan.T_FROM_CURRENT_EULER].T)
orientation_rate = modelV2.orientationRate
fill_xyzt(orientation_rate, ModelConstants.T_IDXS, *net_output_data['plan'][0,:,Plan.ORIENTATION_RATE].T)
# temporal pose
temporal_pose = modelV2.temporalPose
temporal_pose.trans = net_output_data['plan'][0,0,Plan.VELOCITY].tolist()
temporal_pose.transStd = net_output_data['plan_stds'][0,0,Plan.VELOCITY].tolist()
temporal_pose.rot = net_output_data['plan'][0,0,Plan.ORIENTATION_RATE].tolist()
temporal_pose.rotStd = net_output_data['plan_stds'][0,0,Plan.ORIENTATION_RATE].tolist()
# poly path
poly_path = driving_model_data.path
fill_xyz_poly(poly_path, ModelConstants.POLY_PATH_DEGREE, *net_output_data['plan'][0,:,Plan.POSITION].T)
# lateral planning
action = modelV2.action
action.desiredCurvature = desired_curv
# times at X_IDXS according to model plan
PLAN_T_IDXS = [np.nan] * ModelConstants.IDX_N
PLAN_T_IDXS[0] = 0.0
plan_x = net_output_data['plan'][0,:,Plan.POSITION][:,0].tolist()
for xidx in range(1, ModelConstants.IDX_N):
tidx = 0
# increment tidx until we find an element that's further away than the current xidx
while tidx < ModelConstants.IDX_N - 1 and plan_x[tidx+1] < ModelConstants.X_IDXS[xidx]:
tidx += 1
if tidx == ModelConstants.IDX_N - 1:
# if the Plan doesn't extend far enough, set plan_t to the max value (10s), then break
PLAN_T_IDXS[xidx] = ModelConstants.T_IDXS[ModelConstants.IDX_N - 1]
break
# interpolate to find `t` for the current xidx
current_x_val = plan_x[tidx]
next_x_val = plan_x[tidx+1]
p = (ModelConstants.X_IDXS[xidx] - current_x_val) / (next_x_val - current_x_val) if abs(next_x_val - current_x_val) > 1e-9 else float('nan')
PLAN_T_IDXS[xidx] = p * ModelConstants.T_IDXS[tidx+1] + (1 - p) * ModelConstants.T_IDXS[tidx]
# lane lines
modelV2.init('laneLines', 4)
for i in range(4):
lane_line = modelV2.laneLines[i]
fill_xyzt(lane_line, PLAN_T_IDXS, np.array(ModelConstants.X_IDXS), net_output_data['lane_lines'][0,i,:,0], net_output_data['lane_lines'][0,i,:,1])
modelV2.laneLineStds = net_output_data['lane_lines_stds'][0,:,0,0].tolist()
modelV2.laneLineProbs = net_output_data['lane_lines_prob'][0,1::2].tolist()
lane_line_meta = driving_model_data.laneLineMeta
fill_lane_line_meta(lane_line_meta, modelV2.laneLines, modelV2.laneLineProbs)
# road edges
modelV2.init('roadEdges', 2)
for i in range(2):
road_edge = modelV2.roadEdges[i]
fill_xyzt(road_edge, PLAN_T_IDXS, np.array(ModelConstants.X_IDXS), net_output_data['road_edges'][0,i,:,0], net_output_data['road_edges'][0,i,:,1])
modelV2.roadEdgeStds = net_output_data['road_edges_stds'][0,:,0,0].tolist()
# leads
modelV2.init('leadsV3', 3)
for i in range(3):
lead = modelV2.leadsV3[i]
fill_xyvat(lead, ModelConstants.LEAD_T_IDXS, *net_output_data['lead'][0,i].T, *net_output_data['lead_stds'][0,i].T)
lead.prob = net_output_data['lead_prob'][0,i].tolist()
lead.probTime = ModelConstants.LEAD_T_OFFSETS[i]
# meta
meta = modelV2.meta
meta.desireState = net_output_data['desire_state'][0].reshape(-1).tolist()
meta.desirePrediction = net_output_data['desire_pred'][0].reshape(-1).tolist()
meta.engagedProb = net_output_data['meta'][0,Meta.ENGAGED].item()
meta.init('disengagePredictions')
disengage_predictions = meta.disengagePredictions
disengage_predictions.t = ModelConstants.META_T_IDXS
disengage_predictions.brakeDisengageProbs = net_output_data['meta'][0,Meta.BRAKE_DISENGAGE].tolist()
disengage_predictions.gasDisengageProbs = net_output_data['meta'][0,Meta.GAS_DISENGAGE].tolist()
disengage_predictions.steerOverrideProbs = net_output_data['meta'][0,Meta.STEER_OVERRIDE].tolist()
disengage_predictions.brake3MetersPerSecondSquaredProbs = net_output_data['meta'][0,Meta.HARD_BRAKE_3].tolist()
disengage_predictions.brake4MetersPerSecondSquaredProbs = net_output_data['meta'][0,Meta.HARD_BRAKE_4].tolist()
disengage_predictions.brake5MetersPerSecondSquaredProbs = net_output_data['meta'][0,Meta.HARD_BRAKE_5].tolist()
disengage_predictions.gasPressProbs = net_output_data['meta'][0,Meta.GAS_PRESS].tolist()
disengage_predictions.brakePressProbs = net_output_data['meta'][0,Meta.BRAKE_PRESS].tolist()
publish_state.prev_brake_5ms2_probs[:-1] = publish_state.prev_brake_5ms2_probs[1:]
publish_state.prev_brake_5ms2_probs[-1] = net_output_data['meta'][0,Meta.HARD_BRAKE_5][0]
publish_state.prev_brake_3ms2_probs[:-1] = publish_state.prev_brake_3ms2_probs[1:]
publish_state.prev_brake_3ms2_probs[-1] = net_output_data['meta'][0,Meta.HARD_BRAKE_3][0]
hard_brake_predicted = (publish_state.prev_brake_5ms2_probs > ModelConstants.FCW_THRESHOLDS_5MS2).all() and \
(publish_state.prev_brake_3ms2_probs > ModelConstants.FCW_THRESHOLDS_3MS2).all()
meta.hardBrakePredicted = hard_brake_predicted.item()
# confidence
if vipc_frame_id % (2*ModelConstants.MODEL_FREQ) == 0:
# any disengage prob
brake_disengage_probs = net_output_data['meta'][0,Meta.BRAKE_DISENGAGE]
gas_disengage_probs = net_output_data['meta'][0,Meta.GAS_DISENGAGE]
steer_override_probs = net_output_data['meta'][0,Meta.STEER_OVERRIDE]
any_disengage_probs = 1-((1-brake_disengage_probs)*(1-gas_disengage_probs)*(1-steer_override_probs))
# independent disengage prob for each 2s slice
ind_disengage_probs = np.r_[any_disengage_probs[0], np.diff(any_disengage_probs) / (1 - any_disengage_probs[:-1])]
# rolling buf for 2, 4, 6, 8, 10s
publish_state.disengage_buffer[:-ModelConstants.DISENGAGE_WIDTH] = publish_state.disengage_buffer[ModelConstants.DISENGAGE_WIDTH:]
publish_state.disengage_buffer[-ModelConstants.DISENGAGE_WIDTH:] = ind_disengage_probs
score = 0.
for i in range(ModelConstants.DISENGAGE_WIDTH):
score += publish_state.disengage_buffer[i*ModelConstants.DISENGAGE_WIDTH+ModelConstants.DISENGAGE_WIDTH-1-i].item() / ModelConstants.DISENGAGE_WIDTH
if score < ModelConstants.RYG_GREEN:
modelV2.confidence = ConfidenceClass.green
elif score < ModelConstants.RYG_YELLOW:
modelV2.confidence = ConfidenceClass.yellow
else:
modelV2.confidence = ConfidenceClass.red
# raw prediction if enabled
if SEND_RAW_PRED:
modelV2.rawPredictions = net_output_data['raw_pred'].tobytes()
def fill_pose_msg(msg: capnp._DynamicStructBuilder, net_output_data: dict[str, np.ndarray],
vipc_frame_id: int, vipc_dropped_frames: int, timestamp_eof: int, live_calib_seen: bool) -> None:
msg.valid = live_calib_seen & (vipc_dropped_frames < 1)
cameraOdometry = msg.cameraOdometry
cameraOdometry.frameId = vipc_frame_id
cameraOdometry.timestampEof = timestamp_eof
cameraOdometry.trans = net_output_data['pose'][0,:3].tolist()
cameraOdometry.rot = net_output_data['pose'][0,3:].tolist()
cameraOdometry.wideFromDeviceEuler = net_output_data['wide_from_device_euler'][0,:].tolist()
cameraOdometry.roadTransformTrans = net_output_data['road_transform'][0,:3].tolist()
cameraOdometry.transStd = net_output_data['pose_stds'][0,:3].tolist()
cameraOdometry.rotStd = net_output_data['pose_stds'][0,3:].tolist()
cameraOdometry.wideFromDeviceEulerStd = net_output_data['wide_from_device_euler_stds'][0,:].tolist()
cameraOdometry.roadTransformTransStd = net_output_data['road_transform_stds'][0,:3].tolist()

View File

@@ -0,0 +1,28 @@
#!/usr/bin/env python3
import sys
import pathlib
import onnx
import codecs
import pickle
def get_name_and_shape(value_info:onnx.ValueInfoProto) -> tuple[str, tuple[int,...]]:
shape = tuple([int(dim.dim_value) for dim in value_info.type.tensor_type.shape.dim])
name = value_info.name
return name, shape
if __name__ == "__main__":
model_path = pathlib.Path(sys.argv[1])
model = onnx.load(str(model_path))
i = [x.key for x in model.metadata_props].index('output_slices')
output_slices = model.metadata_props[i].value
metadata = {}
metadata['output_slices'] = pickle.loads(codecs.decode(output_slices.encode(), "base64"))
metadata['input_shapes'] = dict([get_name_and_shape(x) for x in model.graph.input])
metadata['output_shapes'] = dict([get_name_and_shape(x) for x in model.graph.output])
metadata_path = model_path.parent / (model_path.stem + '_metadata.pkl')
with open(metadata_path, 'wb') as f:
pickle.dump(metadata, f)
print(f'saved metadata to {metadata_path}')

10
sunnypilot/modeld/modeld Executable file
View File

@@ -0,0 +1,10 @@
#!/usr/bin/env bash
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null && pwd)"
cd "$DIR/../../"
if [ -f "$DIR/libthneed.so" ]; then
export LD_PRELOAD="$DIR/libthneed.so"
fi
exec "$DIR/modeld.py" "$@"

299
sunnypilot/modeld/modeld.py Executable file
View File

@@ -0,0 +1,299 @@
#!/usr/bin/env python3
import os
import time
import pickle
import numpy as np
import cereal.messaging as messaging
from cereal import car, log
from pathlib import Path
from setproctitle import setproctitle
from cereal.messaging import PubMaster, SubMaster
from msgq.visionipc import VisionIpcClient, VisionStreamType, VisionBuf
from opendbc.car.car_helpers import get_demo_car_params
from openpilot.common.swaglog import cloudlog
from openpilot.common.params import Params
from openpilot.common.filter_simple import FirstOrderFilter
from openpilot.common.realtime import config_realtime_process
from openpilot.common.transformations.camera import DEVICE_CAMERAS
from openpilot.common.transformations.model import get_warp_matrix
from openpilot.system import sentry
from openpilot.selfdrive.controls.lib.desire_helper import DesireHelper
from openpilot.sunnypilot.modeld.runners import ModelRunner, Runtime
from openpilot.sunnypilot.modeld.parse_model_outputs import Parser
from openpilot.sunnypilot.modeld.fill_model_msg import fill_model_msg, fill_pose_msg, PublishState
from openpilot.sunnypilot.modeld.constants import ModelConstants
from openpilot.sunnypilot.modeld.models.commonmodel_pyx import DrivingModelFrame, CLContext
PROCESS_NAME = "sunnypilot.modeld.modeld"
SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')
MODEL_PATHS = {
ModelRunner.THNEED: Path(__file__).parent / 'models/supercombo.thneed',
ModelRunner.ONNX: Path(__file__).parent / 'models/supercombo.onnx'}
METADATA_PATH = Path(__file__).parent / 'models/supercombo_metadata.pkl'
class FrameMeta:
frame_id: int = 0
timestamp_sof: int = 0
timestamp_eof: int = 0
def __init__(self, vipc=None):
if vipc is not None:
self.frame_id, self.timestamp_sof, self.timestamp_eof = vipc.frame_id, vipc.timestamp_sof, vipc.timestamp_eof
class ModelState:
frame: DrivingModelFrame
wide_frame: DrivingModelFrame
inputs: dict[str, np.ndarray]
output: np.ndarray
prev_desire: np.ndarray # for tracking the rising edge of the pulse
model: ModelRunner
def __init__(self, context: CLContext):
self.frame = DrivingModelFrame(context)
self.wide_frame = DrivingModelFrame(context)
self.prev_desire = np.zeros(ModelConstants.DESIRE_LEN, dtype=np.float32)
self.full_features_20Hz = np.zeros((ModelConstants.FULL_HISTORY_BUFFER_LEN, ModelConstants.FEATURE_LEN), dtype=np.float32)
self.desire_20Hz = np.zeros((ModelConstants.FULL_HISTORY_BUFFER_LEN + 1, ModelConstants.DESIRE_LEN), dtype=np.float32)
# img buffers are managed in openCL transform code
self.inputs = {
'desire': np.zeros(ModelConstants.DESIRE_LEN * (ModelConstants.HISTORY_BUFFER_LEN+1), dtype=np.float32),
'traffic_convention': np.zeros(ModelConstants.TRAFFIC_CONVENTION_LEN, dtype=np.float32),
'features_buffer': np.zeros(ModelConstants.HISTORY_BUFFER_LEN * ModelConstants.FEATURE_LEN, dtype=np.float32),
}
with open(METADATA_PATH, 'rb') as f:
model_metadata = pickle.load(f)
self.output_slices = model_metadata['output_slices']
net_output_size = model_metadata['output_shapes']['outputs'][1]
self.output = np.zeros(net_output_size, dtype=np.float32)
self.parser = Parser()
self.model = ModelRunner(MODEL_PATHS, self.output, Runtime.GPU, False, context)
self.model.addInput("input_imgs", None)
self.model.addInput("big_input_imgs", None)
for k,v in self.inputs.items():
self.model.addInput(k, v)
def slice_outputs(self, model_outputs: np.ndarray) -> dict[str, np.ndarray]:
parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in self.output_slices.items()}
if SEND_RAW_PRED:
parsed_model_outputs['raw_pred'] = model_outputs.copy()
return parsed_model_outputs
def run(self, buf: VisionBuf, wbuf: VisionBuf, transform: np.ndarray, transform_wide: np.ndarray,
inputs: dict[str, np.ndarray], prepare_only: bool) -> dict[str, np.ndarray] | None:
# Model decides when action is completed, so desire input is just a pulse triggered on rising edge
inputs['desire'][0] = 0
new_desire = np.where(inputs['desire'] - self.prev_desire > .99, inputs['desire'], 0)
self.prev_desire[:] = inputs['desire']
self.desire_20Hz[:-1] = self.desire_20Hz[1:]
self.desire_20Hz[-1] = new_desire
self.inputs['desire'][:] = self.desire_20Hz.reshape((25,4,-1)).max(axis=1).flatten()
self.inputs['traffic_convention'][:] = inputs['traffic_convention']
self.model.setInputBuffer("input_imgs", self.frame.prepare(buf, transform.flatten(), self.model.getCLBuffer("input_imgs")))
self.model.setInputBuffer("big_input_imgs", self.wide_frame.prepare(wbuf, transform_wide.flatten(), self.model.getCLBuffer("big_input_imgs")))
if prepare_only:
return None
self.model.execute()
outputs = self.parser.parse_outputs(self.slice_outputs(self.output))
self.full_features_20Hz[:-1] = self.full_features_20Hz[1:]
self.full_features_20Hz[-1] = outputs['hidden_state'][0, :]
idxs = np.arange(-4,-100,-4)[::-1]
self.inputs['features_buffer'][:] = self.full_features_20Hz[idxs].flatten()
return outputs
def main(demo=False):
cloudlog.warning("modeld init")
sentry.set_tag("daemon", PROCESS_NAME)
cloudlog.bind(daemon=PROCESS_NAME)
setproctitle(PROCESS_NAME)
config_realtime_process(7, 54)
cloudlog.warning("setting up CL context")
cl_context = CLContext()
cloudlog.warning("CL context ready; loading model")
model = ModelState(cl_context)
cloudlog.warning("models loaded, modeld starting")
# visionipc clients
while True:
available_streams = VisionIpcClient.available_streams("camerad", block=False)
if available_streams:
use_extra_client = VisionStreamType.VISION_STREAM_WIDE_ROAD in available_streams and VisionStreamType.VISION_STREAM_ROAD in available_streams
main_wide_camera = VisionStreamType.VISION_STREAM_ROAD not in available_streams
break
time.sleep(.1)
vipc_client_main_stream = VisionStreamType.VISION_STREAM_WIDE_ROAD if main_wide_camera else VisionStreamType.VISION_STREAM_ROAD
vipc_client_main = VisionIpcClient("camerad", vipc_client_main_stream, True, cl_context)
vipc_client_extra = VisionIpcClient("camerad", VisionStreamType.VISION_STREAM_WIDE_ROAD, False, cl_context)
cloudlog.warning(f"vision stream set up, main_wide_camera: {main_wide_camera}, use_extra_client: {use_extra_client}")
while not vipc_client_main.connect(False):
time.sleep(0.1)
while use_extra_client and not vipc_client_extra.connect(False):
time.sleep(0.1)
cloudlog.warning(f"connected main cam with buffer size: {vipc_client_main.buffer_len} ({vipc_client_main.width} x {vipc_client_main.height})")
if use_extra_client:
cloudlog.warning(f"connected extra cam with buffer size: {vipc_client_extra.buffer_len} ({vipc_client_extra.width} x {vipc_client_extra.height})")
# messaging
pm = PubMaster(["modelV2", "drivingModelData", "cameraOdometry"])
sm = SubMaster(["deviceState", "carState", "roadCameraState", "liveCalibration", "driverMonitoringState", "carControl"])
publish_state = PublishState()
params = Params()
# setup filter to track dropped frames
frame_dropped_filter = FirstOrderFilter(0., 10., 1. / ModelConstants.MODEL_FREQ)
frame_id = 0
last_vipc_frame_id = 0
run_count = 0
model_transform_main = np.zeros((3, 3), dtype=np.float32)
model_transform_extra = np.zeros((3, 3), dtype=np.float32)
live_calib_seen = False
buf_main, buf_extra = None, None
meta_main = FrameMeta()
meta_extra = FrameMeta()
if demo:
CP = get_demo_car_params()
else:
CP = messaging.log_from_bytes(params.get("CarParams", block=True), car.CarParams)
cloudlog.info("modeld got CarParams: %s", CP.carName)
# TODO this needs more thought, use .2s extra for now to estimate other delays
steer_delay = CP.steerActuatorDelay + .2
DH = DesireHelper()
while True:
# Keep receiving frames until we are at least 1 frame ahead of previous extra frame
while meta_main.timestamp_sof < meta_extra.timestamp_sof + 25000000:
buf_main = vipc_client_main.recv()
meta_main = FrameMeta(vipc_client_main)
if buf_main is None:
break
if buf_main is None:
cloudlog.debug("vipc_client_main no frame")
continue
if use_extra_client:
# Keep receiving extra frames until frame id matches main camera
while True:
buf_extra = vipc_client_extra.recv()
meta_extra = FrameMeta(vipc_client_extra)
if buf_extra is None or meta_main.timestamp_sof < meta_extra.timestamp_sof + 25000000:
break
if buf_extra is None:
cloudlog.debug("vipc_client_extra no frame")
continue
if abs(meta_main.timestamp_sof - meta_extra.timestamp_sof) > 10000000:
cloudlog.error(f"frames out of sync! main: {meta_main.frame_id} ({meta_main.timestamp_sof / 1e9:.5f}),\
extra: {meta_extra.frame_id} ({meta_extra.timestamp_sof / 1e9:.5f})")
else:
# Use single camera
buf_extra = buf_main
meta_extra = meta_main
sm.update(0)
desire = DH.desire
is_rhd = sm["driverMonitoringState"].isRHD
frame_id = sm["roadCameraState"].frameId
v_ego = max(sm["carState"].vEgo, 0.)
if sm.updated["liveCalibration"] and sm.seen['roadCameraState'] and sm.seen['deviceState']:
device_from_calib_euler = np.array(sm["liveCalibration"].rpyCalib, dtype=np.float32)
dc = DEVICE_CAMERAS[(str(sm['deviceState'].deviceType), str(sm['roadCameraState'].sensor))]
model_transform_main = get_warp_matrix(device_from_calib_euler, dc.ecam.intrinsics if main_wide_camera else dc.fcam.intrinsics, False).astype(np.float32)
model_transform_extra = get_warp_matrix(device_from_calib_euler, dc.ecam.intrinsics, True).astype(np.float32)
live_calib_seen = True
traffic_convention = np.zeros(2)
traffic_convention[int(is_rhd)] = 1
vec_desire = np.zeros(ModelConstants.DESIRE_LEN, dtype=np.float32)
if desire >= 0 and desire < ModelConstants.DESIRE_LEN:
vec_desire[desire] = 1
# tracked dropped frames
vipc_dropped_frames = max(0, meta_main.frame_id - last_vipc_frame_id - 1)
frames_dropped = frame_dropped_filter.update(min(vipc_dropped_frames, 10))
if run_count < 10: # let frame drops warm up
frame_dropped_filter.x = 0.
frames_dropped = 0.
run_count = run_count + 1
frame_drop_ratio = frames_dropped / (1 + frames_dropped)
prepare_only = vipc_dropped_frames > 0
if prepare_only:
cloudlog.error(f"skipping model eval. Dropped {vipc_dropped_frames} frames")
inputs:dict[str, np.ndarray] = {
'desire': vec_desire,
'traffic_convention': traffic_convention,
}
mt1 = time.perf_counter()
model_output = model.run(buf_main, buf_extra, model_transform_main, model_transform_extra, inputs, prepare_only)
mt2 = time.perf_counter()
model_execution_time = mt2 - mt1
if model_output is not None:
modelv2_send = messaging.new_message('modelV2')
drivingdata_send = messaging.new_message('drivingModelData')
posenet_send = messaging.new_message('cameraOdometry')
fill_model_msg(drivingdata_send, modelv2_send, model_output, v_ego, steer_delay,
publish_state, meta_main.frame_id, meta_extra.frame_id, frame_id,
frame_drop_ratio, meta_main.timestamp_eof, model_execution_time, live_calib_seen)
desire_state = modelv2_send.modelV2.meta.desireState
l_lane_change_prob = desire_state[log.Desire.laneChangeLeft]
r_lane_change_prob = desire_state[log.Desire.laneChangeRight]
lane_change_prob = l_lane_change_prob + r_lane_change_prob
DH.update(sm['carState'], sm['carControl'].latActive, lane_change_prob)
modelv2_send.modelV2.meta.laneChangeState = DH.lane_change_state
modelv2_send.modelV2.meta.laneChangeDirection = DH.lane_change_direction
drivingdata_send.drivingModelData.meta.laneChangeState = DH.lane_change_state
drivingdata_send.drivingModelData.meta.laneChangeDirection = DH.lane_change_direction
fill_pose_msg(posenet_send, model_output, meta_main.frame_id, vipc_dropped_frames, meta_main.timestamp_eof, live_calib_seen)
pm.send('modelV2', modelv2_send)
pm.send('drivingModelData', drivingdata_send)
pm.send('cameraOdometry', posenet_send)
last_vipc_frame_id = meta_main.frame_id
if __name__ == "__main__":
try:
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--demo', action='store_true', help='A boolean for demo mode.')
args = parser.parse_args()
main(demo=args.demo)
except KeyboardInterrupt:
cloudlog.warning(f"child {PROCESS_NAME} got SIGINT")
except Exception:
sentry.capture_exception()
raise

View File

@@ -0,0 +1,62 @@
## Neural networks in openpilot
To view the architecture of the ONNX networks, you can use [netron](https://netron.app/)
## Supercombo
### Supercombo input format (Full size: 799906 x float32)
* **image stream**
* Two consecutive images (256 * 512 * 3 in RGB) recorded at 20 Hz : 393216 = 2 * 6 * 128 * 256
* Each 256 * 512 image is represented in YUV420 with 6 channels : 6 * 128 * 256
* Channels 0,1,2,3 represent the full-res Y channel and are represented in numpy as Y[::2, ::2], Y[::2, 1::2], Y[1::2, ::2], and Y[1::2, 1::2]
* Channel 4 represents the half-res U channel
* Channel 5 represents the half-res V channel
* **wide image stream**
* Two consecutive images (256 * 512 * 3 in RGB) recorded at 20 Hz : 393216 = 2 * 6 * 128 * 256
* Each 256 * 512 image is represented in YUV420 with 6 channels : 6 * 128 * 256
* Channels 0,1,2,3 represent the full-res Y channel and are represented in numpy as Y[::2, ::2], Y[::2, 1::2], Y[1::2, ::2], and Y[1::2, 1::2]
* Channel 4 represents the half-res U channel
* Channel 5 represents the half-res V channel
* **desire**
* one-hot encoded buffer to command model to execute certain actions, bit needs to be sent for the past 5 seconds (at 20FPS) : 100 * 8
* **traffic convention**
* one-hot encoded vector to tell model whether traffic is right-hand or left-hand traffic : 2
* **feature buffer**
* A buffer of intermediate features that gets appended to the current feature to form a 5 seconds temporal context (at 20FPS) : 99 * 512
### Supercombo output format (Full size: XXX x float32)
Read [here](https://github.com/commaai/openpilot/blob/90af436a121164a51da9fa48d093c29f738adf6a/selfdrive/modeld/models/driving.h#L236) for more.
## Driver Monitoring Model
* .onnx model can be run with onnx runtimes
* .dlc file is a pre-quantized model and only runs on qualcomm DSPs
### input format
* single image W = 1440 H = 960 luminance channel (Y) from the planar YUV420 format:
* full input size is 1440 * 960 = 1382400
* normalized ranging from 0.0 to 1.0 in float32 (onnx runner) or ranging from 0 to 255 in uint8 (snpe runner)
* camera calibration angles (roll, pitch, yaw) from liveCalibration: 3 x float32 inputs
### output format
* 84 x float32 outputs = 2 + 41 * 2 ([parsing example](https://github.com/commaai/openpilot/blob/22ce4e17ba0d3bfcf37f8255a4dd1dc683fe0c38/selfdrive/modeld/models/dmonitoring.cc#L33))
* for each person in the front seats (2 * 41)
* face pose: 12 = 6 + 6
* face orientation [pitch, yaw, roll] in camera frame: 3
* face position [dx, dy] relative to image center: 2
* normalized face size: 1
* standard deviations for above outputs: 6
* face visible probability: 1
* eyes: 20 = (8 + 1) + (8 + 1) + 1 + 1
* eye position and size, and their standard deviations: 8
* eye visible probability: 1
* eye closed probability: 1
* wearing sunglasses probability: 1
* face occluded probability: 1
* touching wheel probability: 1
* paying attention probability: 1
* (deprecated) distracted probabilities: 2
* using phone probability: 1
* distracted probability: 1
* common outputs 2
* poor camera vision probability: 1
* left hand drive probability: 1

View File

@@ -0,0 +1,69 @@
#include "sunnypilot/modeld/models/commonmodel.h"
#include <cmath>
#include <cstring>
#include "common/clutil.h"
DrivingModelFrame::DrivingModelFrame(cl_device_id device_id, cl_context context) : ModelFrame(device_id, context) {
input_frames = std::make_unique<uint8_t[]>(buf_size);
//input_frames_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err));
img_buffer_20hz_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, 5*frame_size_bytes, NULL, &err));
region.origin = 4 * frame_size_bytes;
region.size = frame_size_bytes;
last_img_cl = CL_CHECK_ERR(clCreateSubBuffer(img_buffer_20hz_cl, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err));
loadyuv_init(&loadyuv, context, device_id, MODEL_WIDTH, MODEL_HEIGHT);
init_transform(device_id, context, MODEL_WIDTH, MODEL_HEIGHT);
}
uint8_t* DrivingModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection, cl_mem* output) {
run_transform(yuv_cl, MODEL_WIDTH, MODEL_HEIGHT, frame_width, frame_height, frame_stride, frame_uv_offset, projection);
for (int i = 0; i < 4; i++) {
CL_CHECK(clEnqueueCopyBuffer(q, img_buffer_20hz_cl, img_buffer_20hz_cl, (i+1)*frame_size_bytes, i*frame_size_bytes, frame_size_bytes, 0, nullptr, nullptr));
}
loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, last_img_cl);
if (output == NULL) {
CL_CHECK(clEnqueueReadBuffer(q, img_buffer_20hz_cl, CL_TRUE, 0, frame_size_bytes, &input_frames[0], 0, nullptr, nullptr));
CL_CHECK(clEnqueueReadBuffer(q, last_img_cl, CL_TRUE, 0, frame_size_bytes, &input_frames[MODEL_FRAME_SIZE], 0, nullptr, nullptr));
clFinish(q);
return &input_frames[0];
} else {
copy_queue(&loadyuv, q, img_buffer_20hz_cl, *output, 0, 0, frame_size_bytes);
copy_queue(&loadyuv, q, last_img_cl, *output, 0, frame_size_bytes, frame_size_bytes);
// NOTE: Since thneed is using a different command queue, this clFinish is needed to ensure the image is ready.
clFinish(q);
return NULL;
}
}
DrivingModelFrame::~DrivingModelFrame() {
deinit_transform();
loadyuv_destroy(&loadyuv);
CL_CHECK(clReleaseMemObject(img_buffer_20hz_cl));
CL_CHECK(clReleaseMemObject(last_img_cl));
CL_CHECK(clReleaseCommandQueue(q));
}
MonitoringModelFrame::MonitoringModelFrame(cl_device_id device_id, cl_context context) : ModelFrame(device_id, context) {
input_frames = std::make_unique<uint8_t[]>(buf_size);
//input_frame_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err));
init_transform(device_id, context, MODEL_WIDTH, MODEL_HEIGHT);
}
uint8_t* MonitoringModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection, cl_mem* output) {
run_transform(yuv_cl, MODEL_WIDTH, MODEL_HEIGHT, frame_width, frame_height, frame_stride, frame_uv_offset, projection);
CL_CHECK(clEnqueueReadBuffer(q, y_cl, CL_TRUE, 0, MODEL_FRAME_SIZE * sizeof(uint8_t), input_frames.get(), 0, nullptr, nullptr));
clFinish(q);
//return &y_cl;
return input_frames.get();
}
MonitoringModelFrame::~MonitoringModelFrame() {
deinit_transform();
CL_CHECK(clReleaseCommandQueue(q));
}

View File

@@ -0,0 +1,98 @@
#pragma once
#include <cfloat>
#include <cstdlib>
#include <cassert>
#include <memory>
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#include "common/mat.h"
#include "selfdrive/modeld/transforms/loadyuv.h"
#include "selfdrive/modeld/transforms/transform.h"
class ModelFrame {
public:
ModelFrame(cl_device_id device_id, cl_context context) {
q = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, 0, &err));
}
virtual ~ModelFrame() {}
virtual uint8_t* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection, cl_mem* output) { return NULL; }
/*
uint8_t* buffer_from_cl(cl_mem *in_frames, int buffer_size) {
CL_CHECK(clEnqueueReadBuffer(q, *in_frames, CL_TRUE, 0, buffer_size, input_frames.get(), 0, nullptr, nullptr));
clFinish(q);
return &input_frames[0];
}
*/
int MODEL_WIDTH;
int MODEL_HEIGHT;
int MODEL_FRAME_SIZE;
int buf_size;
protected:
cl_mem y_cl, u_cl, v_cl;
Transform transform;
cl_command_queue q;
std::unique_ptr<uint8_t[]> input_frames;
void init_transform(cl_device_id device_id, cl_context context, int model_width, int model_height) {
y_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, model_width * model_height, NULL, &err));
u_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (model_width / 2) * (model_height / 2), NULL, &err));
v_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (model_width / 2) * (model_height / 2), NULL, &err));
transform_init(&transform, context, device_id);
}
void deinit_transform() {
transform_destroy(&transform);
CL_CHECK(clReleaseMemObject(v_cl));
CL_CHECK(clReleaseMemObject(u_cl));
CL_CHECK(clReleaseMemObject(y_cl));
}
void run_transform(cl_mem yuv_cl, int model_width, int model_height, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection) {
transform_queue(&transform, q,
yuv_cl, frame_width, frame_height, frame_stride, frame_uv_offset,
y_cl, u_cl, v_cl, model_width, model_height, projection);
}
};
class DrivingModelFrame : public ModelFrame {
public:
DrivingModelFrame(cl_device_id device_id, cl_context context);
~DrivingModelFrame();
uint8_t* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection, cl_mem* output);
const int MODEL_WIDTH = 512;
const int MODEL_HEIGHT = 256;
const int MODEL_FRAME_SIZE = MODEL_WIDTH * MODEL_HEIGHT * 3 / 2;
const int buf_size = MODEL_FRAME_SIZE * 2;
const size_t frame_size_bytes = MODEL_FRAME_SIZE * sizeof(uint8_t);
private:
LoadYUVState loadyuv;
cl_mem img_buffer_20hz_cl, last_img_cl;//, input_frames_cl;
cl_buffer_region region;
};
class MonitoringModelFrame : public ModelFrame {
public:
MonitoringModelFrame(cl_device_id device_id, cl_context context);
~MonitoringModelFrame();
uint8_t* prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3& projection, cl_mem* output);
const int MODEL_WIDTH = 1440;
const int MODEL_HEIGHT = 960;
const int MODEL_FRAME_SIZE = MODEL_WIDTH * MODEL_HEIGHT;
const int buf_size = MODEL_FRAME_SIZE;
private:
// cl_mem input_frame_cl;
};

View File

@@ -0,0 +1,26 @@
# distutils: language = c++
from msgq.visionipc.visionipc cimport cl_device_id, cl_context, cl_mem
cdef extern from "common/mat.h":
cdef struct mat3:
float v[9]
cdef extern from "common/clutil.h":
cdef unsigned long CL_DEVICE_TYPE_DEFAULT
cl_device_id cl_get_device_id(unsigned long)
cl_context cl_create_context(cl_device_id)
cdef extern from "sunnypilot/modeld/models/commonmodel.h":
cppclass ModelFrame:
int buf_size
# unsigned char * buffer_from_cl(cl_mem*, int);
unsigned char * prepare(cl_mem, int, int, int, int, mat3, cl_mem*)
cppclass DrivingModelFrame:
int buf_size
DrivingModelFrame(cl_device_id, cl_context)
cppclass MonitoringModelFrame:
int buf_size
MonitoringModelFrame(cl_device_id, cl_context)

View File

@@ -0,0 +1,13 @@
# distutils: language = c++
from msgq.visionipc.visionipc cimport cl_mem
from msgq.visionipc.visionipc_pyx cimport CLContext as BaseCLContext
cdef class CLContext(BaseCLContext):
pass
cdef class CLMem:
cdef cl_mem * mem
@staticmethod
cdef create(void*)

View File

@@ -0,0 +1,76 @@
# distutils: language = c++
# cython: c_string_encoding=ascii, language_level=3
import numpy as np
cimport numpy as cnp
from libc.string cimport memcpy
from libc.stdint cimport uintptr_t
from msgq.visionipc.visionipc cimport cl_mem
from msgq.visionipc.visionipc_pyx cimport VisionBuf, CLContext as BaseCLContext
from sunnypilot.modeld.models.commonmodel cimport CL_DEVICE_TYPE_DEFAULT, cl_get_device_id, cl_create_context
from sunnypilot.modeld.models.commonmodel cimport mat3, ModelFrame as cppModelFrame, DrivingModelFrame as cppDrivingModelFrame, MonitoringModelFrame as cppMonitoringModelFrame
cdef class CLContext(BaseCLContext):
def __cinit__(self):
self.device_id = cl_get_device_id(CL_DEVICE_TYPE_DEFAULT)
self.context = cl_create_context(self.device_id)
cdef class CLMem:
@staticmethod
cdef create(void * cmem):
mem = CLMem()
mem.mem = <cl_mem*> cmem
return mem
@property
def mem_address(self):
return <uintptr_t>(self.mem)
def cl_from_visionbuf(VisionBuf buf):
return CLMem.create(<void*>&buf.buf.buf_cl)
cdef class ModelFrame:
cdef cppModelFrame * frame
cdef int buf_size
def __dealloc__(self):
del self.frame
def prepare(self, VisionBuf buf, float[:] projection, CLMem output):
cdef mat3 cprojection
memcpy(cprojection.v, &projection[0], 9*sizeof(float))
cdef unsigned char * data
if output is None:
data = self.frame.prepare(buf.buf.buf_cl, buf.width, buf.height, buf.stride, buf.uv_offset, cprojection, NULL)
else:
data = self.frame.prepare(buf.buf.buf_cl, buf.width, buf.height, buf.stride, buf.uv_offset, cprojection, output.mem)
if not data:
return None
return np.asarray(<cnp.uint8_t[:self.buf_size]> data)
# return CLMem.create(data)
# def buffer_from_cl(self, CLMem in_frames):
# cdef unsigned char * data2
# data2 = self.frame.buffer_from_cl(in_frames.mem, self.buf_size)
# return np.asarray(<cnp.uint8_t[:self.buf_size]> data2)
cdef class DrivingModelFrame(ModelFrame):
cdef cppDrivingModelFrame * _frame
def __cinit__(self, CLContext context):
self._frame = new cppDrivingModelFrame(context.device_id, context.context)
self.frame = <cppModelFrame*>(self._frame)
self.buf_size = self._frame.buf_size
cdef class MonitoringModelFrame(ModelFrame):
cdef cppMonitoringModelFrame * _frame
def __cinit__(self, CLContext context):
self._frame = new cppMonitoringModelFrame(context.device_id, context.context)
self.frame = <cppModelFrame*>(self._frame)
self.buf_size = self._frame.buf_size

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,103 @@
import numpy as np
from openpilot.sunnypilot.modeld.constants import ModelConstants
def safe_exp(x, out=None):
# -11 is around 10**14, more causes float16 overflow
return np.exp(np.clip(x, -np.inf, 11), out=out)
def sigmoid(x):
return 1. / (1. + safe_exp(-x))
def softmax(x, axis=-1):
x -= np.max(x, axis=axis, keepdims=True)
if x.dtype == np.float32 or x.dtype == np.float64:
safe_exp(x, out=x)
else:
x = safe_exp(x)
x /= np.sum(x, axis=axis, keepdims=True)
return x
class Parser:
def __init__(self, ignore_missing=False):
self.ignore_missing = ignore_missing
def check_missing(self, outs, name):
if name not in outs and not self.ignore_missing:
raise ValueError(f"Missing output {name}")
return name not in outs
def parse_categorical_crossentropy(self, name, outs, out_shape=None):
if self.check_missing(outs, name):
return
raw = outs[name]
if out_shape is not None:
raw = raw.reshape((raw.shape[0],) + out_shape)
outs[name] = softmax(raw, axis=-1)
def parse_binary_crossentropy(self, name, outs):
if self.check_missing(outs, name):
return
raw = outs[name]
outs[name] = sigmoid(raw)
def parse_mdn(self, name, outs, in_N=0, out_N=1, out_shape=None):
if self.check_missing(outs, name):
return
raw = outs[name]
raw = raw.reshape((raw.shape[0], max(in_N, 1), -1))
n_values = (raw.shape[2] - out_N)//2
pred_mu = raw[:,:,:n_values]
pred_std = safe_exp(raw[:,:,n_values: 2*n_values])
if in_N > 1:
weights = np.zeros((raw.shape[0], in_N, out_N), dtype=raw.dtype)
for i in range(out_N):
weights[:,:,i - out_N] = softmax(raw[:,:,i - out_N], axis=-1)
if out_N == 1:
for fidx in range(weights.shape[0]):
idxs = np.argsort(weights[fidx][:,0])[::-1]
weights[fidx] = weights[fidx][idxs]
pred_mu[fidx] = pred_mu[fidx][idxs]
pred_std[fidx] = pred_std[fidx][idxs]
full_shape = tuple([raw.shape[0], in_N] + list(out_shape))
outs[name + '_weights'] = weights
outs[name + '_hypotheses'] = pred_mu.reshape(full_shape)
outs[name + '_stds_hypotheses'] = pred_std.reshape(full_shape)
pred_mu_final = np.zeros((raw.shape[0], out_N, n_values), dtype=raw.dtype)
pred_std_final = np.zeros((raw.shape[0], out_N, n_values), dtype=raw.dtype)
for fidx in range(weights.shape[0]):
for hidx in range(out_N):
idxs = np.argsort(weights[fidx,:,hidx])[::-1]
pred_mu_final[fidx, hidx] = pred_mu[fidx, idxs[0]]
pred_std_final[fidx, hidx] = pred_std[fidx, idxs[0]]
else:
pred_mu_final = pred_mu
pred_std_final = pred_std
if out_N > 1:
final_shape = tuple([raw.shape[0], out_N] + list(out_shape))
else:
final_shape = tuple([raw.shape[0],] + list(out_shape))
outs[name] = pred_mu_final.reshape(final_shape)
outs[name + '_stds'] = pred_std_final.reshape(final_shape)
def parse_outputs(self, outs: dict[str, np.ndarray]) -> dict[str, np.ndarray]:
self.parse_mdn('plan', outs, in_N=ModelConstants.PLAN_MHP_N, out_N=ModelConstants.PLAN_MHP_SELECTION,
out_shape=(ModelConstants.IDX_N,ModelConstants.PLAN_WIDTH))
self.parse_mdn('lane_lines', outs, in_N=0, out_N=0, out_shape=(ModelConstants.NUM_LANE_LINES,ModelConstants.IDX_N,ModelConstants.LANE_LINES_WIDTH))
self.parse_mdn('road_edges', outs, in_N=0, out_N=0, out_shape=(ModelConstants.NUM_ROAD_EDGES,ModelConstants.IDX_N,ModelConstants.LANE_LINES_WIDTH))
self.parse_mdn('pose', outs, in_N=0, out_N=0, out_shape=(ModelConstants.POSE_WIDTH,))
self.parse_mdn('road_transform', outs, in_N=0, out_N=0, out_shape=(ModelConstants.POSE_WIDTH,))
self.parse_mdn('wide_from_device_euler', outs, in_N=0, out_N=0, out_shape=(ModelConstants.WIDE_FROM_DEVICE_WIDTH,))
self.parse_mdn('lead', outs, in_N=ModelConstants.LEAD_MHP_N, out_N=ModelConstants.LEAD_MHP_SELECTION,
out_shape=(ModelConstants.LEAD_TRAJ_LEN,ModelConstants.LEAD_WIDTH))
if 'lat_planner_solution' in outs:
self.parse_mdn('lat_planner_solution', outs, in_N=0, out_N=0, out_shape=(ModelConstants.IDX_N,ModelConstants.LAT_PLANNER_SOLUTION_WIDTH))
for k in ['lead_prob', 'lane_lines_prob', 'meta']:
self.parse_binary_crossentropy(k, outs)
self.parse_categorical_crossentropy('desire_state', outs, out_shape=(ModelConstants.DESIRE_PRED_WIDTH,))
self.parse_categorical_crossentropy('desire_pred', outs, out_shape=(ModelConstants.DESIRE_PRED_LEN,ModelConstants.DESIRE_PRED_WIDTH))
return outs

View File

@@ -0,0 +1,27 @@
import os
from openpilot.system.hardware import TICI
from openpilot.sunnypilot.modeld.runners.runmodel_pyx import RunModel, Runtime
assert Runtime
USE_THNEED = int(os.getenv('USE_THNEED', str(int(TICI))))
USE_SNPE = int(os.getenv('USE_SNPE', str(int(TICI))))
class ModelRunner(RunModel):
THNEED = 'THNEED'
SNPE = 'SNPE'
ONNX = 'ONNX'
def __new__(cls, paths, *args, **kwargs):
if ModelRunner.THNEED in paths and USE_THNEED:
from openpilot.sunnypilot.modeld.runners.thneedmodel_pyx import ThneedModel as Runner
runner_type = ModelRunner.THNEED
elif ModelRunner.SNPE in paths and USE_SNPE:
from openpilot.sunnypilot.modeld.runners.snpemodel_pyx import SNPEModel as Runner
runner_type = ModelRunner.SNPE
elif ModelRunner.ONNX in paths:
from openpilot.sunnypilot.modeld.runners.onnxmodel import ONNXModel as Runner
runner_type = ModelRunner.ONNX
else:
raise Exception("Couldn't select a model runner, make sure to pass at least one valid model path")
return Runner(str(paths[runner_type]), *args, **kwargs)

View File

@@ -0,0 +1,71 @@
import os
import onnx
import sys
import numpy as np
from typing import Any
from openpilot.sunnypilot.modeld.runners.runmodel_pyx import RunModel
from openpilot.sunnypilot.modeld.runners.ort_helpers import convert_fp16_to_fp32, ORT_TYPES_TO_NP_TYPES
def create_ort_session(path, fp16_to_fp32):
os.environ["OMP_NUM_THREADS"] = "4"
os.environ["OMP_WAIT_POLICY"] = "PASSIVE"
import onnxruntime as ort
print("Onnx available providers: ", ort.get_available_providers(), file=sys.stderr)
options = ort.SessionOptions()
options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL
provider: str | tuple[str, dict[Any, Any]]
if 'OpenVINOExecutionProvider' in ort.get_available_providers() and 'ONNXCPU' not in os.environ:
provider = 'OpenVINOExecutionProvider'
elif 'CUDAExecutionProvider' in ort.get_available_providers() and 'ONNXCPU' not in os.environ:
options.intra_op_num_threads = 2
provider = ('CUDAExecutionProvider', {'cudnn_conv_algo_search': 'EXHAUSTIVE'})
else:
options.intra_op_num_threads = 2
options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
provider = 'CPUExecutionProvider'
model_data = convert_fp16_to_fp32(onnx.load(path)) if fp16_to_fp32 else path
print("Onnx selected provider: ", [provider], file=sys.stderr)
ort_session = ort.InferenceSession(model_data, options, providers=[provider])
print("Onnx using ", ort_session.get_providers(), file=sys.stderr)
return ort_session
class ONNXModel(RunModel):
def __init__(self, path, output, runtime, use_tf8, cl_context):
self.inputs = {}
self.output = output
self.session = create_ort_session(path, fp16_to_fp32=True)
self.input_names = [x.name for x in self.session.get_inputs()]
self.input_shapes = {x.name: [1, *x.shape[1:]] for x in self.session.get_inputs()}
self.input_dtypes = {x.name: ORT_TYPES_TO_NP_TYPES[x.type] for x in self.session.get_inputs()}
# run once to initialize CUDA provider
if "CUDAExecutionProvider" in self.session.get_providers():
self.session.run(None, {k: np.zeros(self.input_shapes[k], dtype=self.input_dtypes[k]) for k in self.input_names})
print("ready to run onnx model", self.input_shapes, file=sys.stderr)
def addInput(self, name, buffer):
assert name in self.input_names
self.inputs[name] = buffer
def setInputBuffer(self, name, buffer):
assert name in self.inputs
self.inputs[name] = buffer
def getCLBuffer(self, name):
return None
def execute(self):
inputs = {k: v.view(self.input_dtypes[k]) for k,v in self.inputs.items()}
inputs = {k: v.reshape(self.input_shapes[k]).astype(self.input_dtypes[k]) for k,v in inputs.items()}
outputs = self.session.run(None, inputs)
assert len(outputs) == 1, "Only single model outputs are supported"
self.output[:] = outputs[0]
return self.output

View File

@@ -0,0 +1,36 @@
import onnx
import onnxruntime as ort
import numpy as np
import itertools
ORT_TYPES_TO_NP_TYPES = {'tensor(float16)': np.float16, 'tensor(float)': np.float32, 'tensor(uint8)': np.uint8}
def attributeproto_fp16_to_fp32(attr):
float32_list = np.frombuffer(attr.raw_data, dtype=np.float16)
attr.data_type = 1
attr.raw_data = float32_list.astype(np.float32).tobytes()
def convert_fp16_to_fp32(model):
for i in model.graph.initializer:
if i.data_type == 10:
attributeproto_fp16_to_fp32(i)
for i in itertools.chain(model.graph.input, model.graph.output):
if i.type.tensor_type.elem_type == 10:
i.type.tensor_type.elem_type = 1
for i in model.graph.node:
if i.op_type == 'Cast' and i.attribute[0].i == 10:
i.attribute[0].i = 1
for a in i.attribute:
if hasattr(a, 't'):
if a.t.data_type == 10:
attributeproto_fp16_to_fp32(a.t)
return model.SerializeToString()
def make_onnx_cpu_runner(model_path):
options = ort.SessionOptions()
options.intra_op_num_threads = 4
options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
model_data = convert_fp16_to_fp32(onnx.load(model_path))
return ort.InferenceSession(model_data, options, providers=['CPUExecutionProvider'])

View File

@@ -0,0 +1,4 @@
#pragma once
#include "sunnypilot/modeld/runners/runmodel.h"
#include "sunnypilot/modeld/runners/snpemodel.h"

View File

@@ -0,0 +1,49 @@
#pragma once
#include <string>
#include <vector>
#include <memory>
#include <cassert>
#include "common/clutil.h"
#include "common/swaglog.h"
#define USE_CPU_RUNTIME 0
#define USE_GPU_RUNTIME 1
#define USE_DSP_RUNTIME 2
struct ModelInput {
const std::string name;
float *buffer;
int size;
ModelInput(const std::string _name, float *_buffer, int _size) : name(_name), buffer(_buffer), size(_size) {}
virtual void setBuffer(float *_buffer, int _size) {
assert(size == _size || size == 0);
buffer = _buffer;
size = _size;
}
};
class RunModel {
public:
std::vector<std::unique_ptr<ModelInput>> inputs;
virtual ~RunModel() {}
virtual void execute() {}
virtual void* getCLBuffer(const std::string name) { return nullptr; }
virtual void addInput(const std::string name, float *buffer, int size) {
inputs.push_back(std::unique_ptr<ModelInput>(new ModelInput(name, buffer, size)));
}
virtual void setInputBuffer(const std::string name, float *buffer, int size) {
for (auto &input : inputs) {
if (name == input->name) {
input->setBuffer(buffer, size);
return;
}
}
LOGE("Tried to update input `%s` but no input with this name exists", name.c_str());
assert(false);
}
};

View File

@@ -0,0 +1,14 @@
# distutils: language = c++
from libcpp.string cimport string
cdef extern from "sunnypilot/modeld/runners/runmodel.h":
cdef int USE_CPU_RUNTIME
cdef int USE_GPU_RUNTIME
cdef int USE_DSP_RUNTIME
cdef cppclass RunModel:
void addInput(string, float*, int)
void setInputBuffer(string, float*, int)
void * getCLBuffer(string)
void execute()

View File

@@ -0,0 +1,6 @@
# distutils: language = c++
from .runmodel cimport RunModel as cppRunModel
cdef class RunModel:
cdef cppRunModel * model

View File

@@ -0,0 +1,37 @@
# distutils: language = c++
# cython: c_string_encoding=ascii, language_level=3
from libcpp.string cimport string
from .runmodel cimport USE_CPU_RUNTIME, USE_GPU_RUNTIME, USE_DSP_RUNTIME
from sunnypilot.modeld.models.commonmodel_pyx cimport CLMem
class Runtime:
CPU = USE_CPU_RUNTIME
GPU = USE_GPU_RUNTIME
DSP = USE_DSP_RUNTIME
cdef class RunModel:
def __dealloc__(self):
del self.model
def addInput(self, string name, float[:] buffer):
if buffer is not None:
self.model.addInput(name, &buffer[0], len(buffer))
else:
self.model.addInput(name, NULL, 0)
def setInputBuffer(self, string name, float[:] buffer):
if buffer is not None:
self.model.setInputBuffer(name, &buffer[0], len(buffer))
else:
self.model.setInputBuffer(name, NULL, 0)
def getCLBuffer(self, string name):
cdef void * cl_buf = self.model.getCLBuffer(name)
if not cl_buf:
return None
return CLMem.create(cl_buf)
def execute(self):
self.model.execute()

View File

@@ -0,0 +1,116 @@
#pragma clang diagnostic ignored "-Wexceptions"
#include "sunnypilot/modeld/runners/snpemodel.h"
#include <cstring>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "common/util.h"
#include "common/timing.h"
void PrintErrorStringAndExit() {
std::cerr << zdl::DlSystem::getLastErrorString() << std::endl;
std::exit(EXIT_FAILURE);
}
SNPEModel::SNPEModel(const std::string path, float *_output, size_t _output_size, int runtime, bool _use_tf8, cl_context context) {
output = _output;
output_size = _output_size;
use_tf8 = _use_tf8;
#ifdef QCOM2
if (runtime == USE_GPU_RUNTIME) {
snpe_runtime = zdl::DlSystem::Runtime_t::GPU;
} else if (runtime == USE_DSP_RUNTIME) {
snpe_runtime = zdl::DlSystem::Runtime_t::DSP;
} else {
snpe_runtime = zdl::DlSystem::Runtime_t::CPU;
}
assert(zdl::SNPE::SNPEFactory::isRuntimeAvailable(snpe_runtime));
#endif
model_data = util::read_file(path);
assert(model_data.size() > 0);
// load model
std::unique_ptr<zdl::DlContainer::IDlContainer> container = zdl::DlContainer::IDlContainer::open((uint8_t*)model_data.data(), model_data.size());
if (!container) { PrintErrorStringAndExit(); }
LOGW("loaded model with size: %lu", model_data.size());
// create model runner
zdl::SNPE::SNPEBuilder snpe_builder(container.get());
while (!snpe) {
#ifdef QCOM2
snpe = snpe_builder.setOutputLayers({})
.setRuntimeProcessor(snpe_runtime)
.setUseUserSuppliedBuffers(true)
.setPerformanceProfile(zdl::DlSystem::PerformanceProfile_t::HIGH_PERFORMANCE)
.build();
#else
snpe = snpe_builder.setOutputLayers({})
.setUseUserSuppliedBuffers(true)
.setPerformanceProfile(zdl::DlSystem::PerformanceProfile_t::HIGH_PERFORMANCE)
.build();
#endif
if (!snpe) std::cerr << zdl::DlSystem::getLastErrorString() << std::endl;
}
// create output buffer
zdl::DlSystem::UserBufferEncodingFloat ub_encoding_float;
zdl::DlSystem::IUserBufferFactory &ub_factory = zdl::SNPE::SNPEFactory::getUserBufferFactory();
const auto &output_tensor_names_opt = snpe->getOutputTensorNames();
if (!output_tensor_names_opt) throw std::runtime_error("Error obtaining output tensor names");
const auto &output_tensor_names = *output_tensor_names_opt;
assert(output_tensor_names.size() == 1);
const char *output_tensor_name = output_tensor_names.at(0);
const zdl::DlSystem::TensorShape &buffer_shape = snpe->getInputOutputBufferAttributes(output_tensor_name)->getDims();
if (output_size != 0) {
assert(output_size == buffer_shape[1]);
} else {
output_size = buffer_shape[1];
}
std::vector<size_t> output_strides = {output_size * sizeof(float), sizeof(float)};
output_buffer = ub_factory.createUserBuffer(output, output_size * sizeof(float), output_strides, &ub_encoding_float);
output_map.add(output_tensor_name, output_buffer.get());
}
void SNPEModel::addInput(const std::string name, float *buffer, int size) {
const int idx = inputs.size();
const auto &input_tensor_names_opt = snpe->getInputTensorNames();
if (!input_tensor_names_opt) throw std::runtime_error("Error obtaining input tensor names");
const auto &input_tensor_names = *input_tensor_names_opt;
const char *input_tensor_name = input_tensor_names.at(idx);
const bool input_tf8 = use_tf8 && strcmp(input_tensor_name, "input_img") == 0; // TODO: This is a terrible hack, get rid of this name check both here and in onnx_runner.py
LOGW("adding index %d: %s", idx, input_tensor_name);
zdl::DlSystem::UserBufferEncodingFloat ub_encoding_float;
zdl::DlSystem::UserBufferEncodingTf8 ub_encoding_tf8(0, 1./255); // network takes 0-1
zdl::DlSystem::IUserBufferFactory &ub_factory = zdl::SNPE::SNPEFactory::getUserBufferFactory();
zdl::DlSystem::UserBufferEncoding *input_encoding = input_tf8 ? (zdl::DlSystem::UserBufferEncoding*)&ub_encoding_tf8 : (zdl::DlSystem::UserBufferEncoding*)&ub_encoding_float;
const auto &buffer_shape_opt = snpe->getInputDimensions(input_tensor_name);
const zdl::DlSystem::TensorShape &buffer_shape = *buffer_shape_opt;
size_t size_of_input = input_tf8 ? sizeof(uint8_t) : sizeof(float);
std::vector<size_t> strides(buffer_shape.rank());
strides[strides.size() - 1] = size_of_input;
size_t product = 1;
for (size_t i = 0; i < buffer_shape.rank(); i++) product *= buffer_shape[i];
size_t stride = strides[strides.size() - 1];
for (size_t i = buffer_shape.rank() - 1; i > 0; i--) {
stride *= buffer_shape[i];
strides[i-1] = stride;
}
auto input_buffer = ub_factory.createUserBuffer(buffer, product*size_of_input, strides, input_encoding);
input_map.add(input_tensor_name, input_buffer.get());
inputs.push_back(std::unique_ptr<SNPEModelInput>(new SNPEModelInput(name, buffer, size, std::move(input_buffer))));
}
void SNPEModel::execute() {
if (!snpe->execute(input_map, output_map)) {
PrintErrorStringAndExit();
}
}

View File

@@ -0,0 +1,52 @@
#pragma once
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
#include <memory>
#include <string>
#include <utility>
#include <DlContainer/IDlContainer.hpp>
#include <DlSystem/DlError.hpp>
#include <DlSystem/ITensor.hpp>
#include <DlSystem/ITensorFactory.hpp>
#include <DlSystem/IUserBuffer.hpp>
#include <DlSystem/IUserBufferFactory.hpp>
#include <SNPE/SNPE.hpp>
#include <SNPE/SNPEBuilder.hpp>
#include <SNPE/SNPEFactory.hpp>
#include "sunnypilot/modeld/runners/runmodel.h"
struct SNPEModelInput : public ModelInput {
std::unique_ptr<zdl::DlSystem::IUserBuffer> snpe_buffer;
SNPEModelInput(const std::string _name, float *_buffer, int _size, std::unique_ptr<zdl::DlSystem::IUserBuffer> _snpe_buffer) : ModelInput(_name, _buffer, _size), snpe_buffer(std::move(_snpe_buffer)) {}
void setBuffer(float *_buffer, int _size) {
ModelInput::setBuffer(_buffer, _size);
assert(snpe_buffer->setBufferAddress(_buffer) == true);
}
};
class SNPEModel : public RunModel {
public:
SNPEModel(const std::string path, float *_output, size_t _output_size, int runtime, bool use_tf8 = false, cl_context context = NULL);
void addInput(const std::string name, float *buffer, int size);
void execute();
private:
std::string model_data;
#ifdef QCOM2
zdl::DlSystem::Runtime_t snpe_runtime;
#endif
// snpe model stuff
std::unique_ptr<zdl::SNPE::SNPE> snpe;
zdl::DlSystem::UserBufferMap input_map;
zdl::DlSystem::UserBufferMap output_map;
std::unique_ptr<zdl::DlSystem::IUserBuffer> output_buffer;
bool use_tf8;
float *output;
size_t output_size;
};

View File

@@ -0,0 +1,9 @@
# distutils: language = c++
from libcpp.string cimport string
from msgq.visionipc.visionipc cimport cl_context
cdef extern from "sunnypilot/modeld/runners/snpemodel.h":
cdef cppclass SNPEModel:
SNPEModel(string, float*, size_t, int, bool, cl_context)

View File

@@ -0,0 +1,17 @@
# distutils: language = c++
# cython: c_string_encoding=ascii, language_level=3
import os
from libcpp cimport bool
from libcpp.string cimport string
from .snpemodel cimport SNPEModel as cppSNPEModel
from sunnypilot.modeld.models.commonmodel_pyx cimport CLContext
from sunnypilot.modeld.runners.runmodel_pyx cimport RunModel
from sunnypilot.modeld.runners.runmodel cimport RunModel as cppRunModel
os.environ['ADSP_LIBRARY_PATH'] = "/data/pythonpath/third_party/snpe/dsp/"
cdef class SNPEModel(RunModel):
def __cinit__(self, string path, float[:] output, int runtime, bool use_tf8, CLContext context):
self.model = <cppRunModel *> new cppSNPEModel(path, &output[0], len(output), runtime, use_tf8, context.context)

View File

@@ -0,0 +1,58 @@
#include "sunnypilot/modeld/runners/thneedmodel.h"
#include <string>
#include "common/swaglog.h"
ThneedModel::ThneedModel(const std::string path, float *_output, size_t _output_size, int runtime, bool luse_tf8, cl_context context) {
thneed = new Thneed(true, context);
thneed->load(path.c_str());
thneed->clexec();
recorded = false;
output = _output;
}
void* ThneedModel::getCLBuffer(const std::string name) {
int index = -1;
for (int i = 0; i < inputs.size(); i++) {
if (name == inputs[i]->name) {
index = i;
break;
}
}
if (index == -1) {
LOGE("Tried to get CL buffer for input `%s` but no input with this name exists", name.c_str());
assert(false);
}
if (thneed->input_clmem.size() >= inputs.size()) {
return &thneed->input_clmem[inputs.size() - index - 1];
} else {
return nullptr;
}
}
void ThneedModel::execute() {
if (!recorded) {
thneed->record = true;
float *input_buffers[inputs.size()];
for (int i = 0; i < inputs.size(); i++) {
input_buffers[inputs.size() - i - 1] = inputs[i]->buffer;
}
thneed->copy_inputs(input_buffers);
thneed->clexec();
thneed->copy_output(output);
thneed->stop();
recorded = true;
} else {
float *input_buffers[inputs.size()];
for (int i = 0; i < inputs.size(); i++) {
input_buffers[inputs.size() - i - 1] = inputs[i]->buffer;
}
thneed->execute(input_buffers, output);
}
}

View File

@@ -0,0 +1,17 @@
#pragma once
#include <string>
#include "sunnypilot/modeld/runners/runmodel.h"
#include "sunnypilot/modeld/thneed/thneed.h"
class ThneedModel : public RunModel {
public:
ThneedModel(const std::string path, float *_output, size_t _output_size, int runtime, bool use_tf8 = false, cl_context context = NULL);
void *getCLBuffer(const std::string name);
void execute();
private:
Thneed *thneed = NULL;
bool recorded;
float *output;
};

View File

@@ -0,0 +1,9 @@
# distutils: language = c++
from libcpp.string cimport string
from msgq.visionipc.visionipc cimport cl_context
cdef extern from "sunnypilot/modeld/runners/thneedmodel.h":
cdef cppclass ThneedModel:
ThneedModel(string, float*, size_t, int, bool, cl_context)

View File

@@ -0,0 +1,14 @@
# distutils: language = c++
# cython: c_string_encoding=ascii, language_level=3
from libcpp cimport bool
from libcpp.string cimport string
from .thneedmodel cimport ThneedModel as cppThneedModel
from sunnypilot.modeld.models.commonmodel_pyx cimport CLContext
from sunnypilot.modeld.runners.runmodel_pyx cimport RunModel
from sunnypilot.modeld.runners.runmodel cimport RunModel as cppRunModel
cdef class ThneedModel(RunModel):
def __cinit__(self, string path, float[:] output, int runtime, bool use_tf8, CLContext context):
self.model = <cppRunModel *> new cppThneedModel(path, &output[0], len(output), runtime, use_tf8, context.context)

View File

@@ -0,0 +1,8 @@
thneed is an SNPE accelerator. I know SNPE is already an accelerator, but sometimes things need to go even faster..
It runs on the local device, and caches a single model run. Then it replays it, but fast.
thneed slices through abstraction layers like a fish.
You need a thneed.

View File

View File

@@ -0,0 +1,154 @@
#include <cassert>
#include <set>
#include "third_party/json11/json11.hpp"
#include "common/util.h"
#include "common/clutil.h"
#include "common/swaglog.h"
#include "sunnypilot/modeld/thneed/thneed.h"
using namespace json11;
extern map<cl_program, string> g_program_source;
void Thneed::load(const char *filename) {
LOGD("Thneed::load: loading from %s\n", filename);
string buf = util::read_file(filename);
int jsz = *(int *)buf.data();
string jsonerr;
string jj(buf.data() + sizeof(int), jsz);
Json jdat = Json::parse(jj, jsonerr);
map<cl_mem, cl_mem> real_mem;
real_mem[NULL] = NULL;
int ptr = sizeof(int)+jsz;
for (auto &obj : jdat["objects"].array_items()) {
auto mobj = obj.object_items();
int sz = mobj["size"].int_value();
cl_mem clbuf = NULL;
if (mobj["buffer_id"].string_value().size() > 0) {
// image buffer must already be allocated
clbuf = real_mem[*(cl_mem*)(mobj["buffer_id"].string_value().data())];
assert(mobj["needs_load"].bool_value() == false);
} else {
if (mobj["needs_load"].bool_value()) {
clbuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sz, &buf[ptr], NULL);
if (debug >= 1) printf("loading %p %d @ 0x%X\n", clbuf, sz, ptr);
ptr += sz;
} else {
// TODO: is there a faster way to init zeroed out buffers?
void *host_zeros = calloc(sz, 1);
clbuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sz, host_zeros, NULL);
free(host_zeros);
}
}
assert(clbuf != NULL);
if (mobj["arg_type"] == "image2d_t" || mobj["arg_type"] == "image1d_t") {
cl_image_desc desc = {0};
desc.image_type = (mobj["arg_type"] == "image2d_t") ? CL_MEM_OBJECT_IMAGE2D : CL_MEM_OBJECT_IMAGE1D_BUFFER;
desc.image_width = mobj["width"].int_value();
desc.image_height = mobj["height"].int_value();
desc.image_row_pitch = mobj["row_pitch"].int_value();
assert(sz == desc.image_height*desc.image_row_pitch);
#ifdef QCOM2
desc.buffer = clbuf;
#else
// TODO: we are creating unused buffers on PC
clReleaseMemObject(clbuf);
#endif
cl_image_format format = {0};
format.image_channel_order = CL_RGBA;
format.image_channel_data_type = mobj["float32"].bool_value() ? CL_FLOAT : CL_HALF_FLOAT;
cl_int errcode;
#ifndef QCOM2
if (mobj["needs_load"].bool_value()) {
clbuf = clCreateImage(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, &format, &desc, &buf[ptr-sz], &errcode);
} else {
clbuf = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &errcode);
}
#else
clbuf = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &errcode);
#endif
if (clbuf == NULL) {
LOGE("clError: %s create image %zux%zu rp %zu with buffer %p\n", cl_get_error_string(errcode),
desc.image_width, desc.image_height, desc.image_row_pitch, desc.buffer);
}
assert(clbuf != NULL);
}
real_mem[*(cl_mem*)(mobj["id"].string_value().data())] = clbuf;
}
map<string, cl_program> g_programs;
for (const auto &[name, source] : jdat["programs"].object_items()) {
if (debug >= 1) printf("building %s with size %zu\n", name.c_str(), source.string_value().size());
g_programs[name] = cl_program_from_source(context, device_id, source.string_value());
}
for (auto &obj : jdat["inputs"].array_items()) {
auto mobj = obj.object_items();
int sz = mobj["size"].int_value();
cl_mem aa = real_mem[*(cl_mem*)(mobj["buffer_id"].string_value().data())];
input_clmem.push_back(aa);
input_sizes.push_back(sz);
LOGD("Thneed::load: adding input %s with size %d\n", mobj["name"].string_value().data(), sz);
cl_int cl_err;
void *ret = clEnqueueMapBuffer(command_queue, aa, CL_TRUE, CL_MAP_WRITE, 0, sz, 0, NULL, NULL, &cl_err);
if (cl_err != CL_SUCCESS) LOGE("clError: %s map %p %d\n", cl_get_error_string(cl_err), aa, sz);
assert(cl_err == CL_SUCCESS);
inputs.push_back(ret);
}
for (auto &obj : jdat["outputs"].array_items()) {
auto mobj = obj.object_items();
int sz = mobj["size"].int_value();
LOGD("Thneed::save: adding output with size %d\n", sz);
// TODO: support multiple outputs
output = real_mem[*(cl_mem*)(mobj["buffer_id"].string_value().data())];
assert(output != NULL);
}
for (auto &obj : jdat["binaries"].array_items()) {
string name = obj["name"].string_value();
size_t length = obj["length"].int_value();
if (debug >= 1) printf("binary %s with size %zu\n", name.c_str(), length);
g_programs[name] = cl_program_from_binary(context, device_id, (const uint8_t*)&buf[ptr], length);
ptr += length;
}
for (auto &obj : jdat["kernels"].array_items()) {
auto gws = obj["global_work_size"];
auto lws = obj["local_work_size"];
auto kk = shared_ptr<CLQueuedKernel>(new CLQueuedKernel(this));
kk->name = obj["name"].string_value();
kk->program = g_programs[kk->name];
kk->work_dim = obj["work_dim"].int_value();
for (int i = 0; i < kk->work_dim; i++) {
kk->global_work_size[i] = gws[i].int_value();
kk->local_work_size[i] = lws[i].int_value();
}
kk->num_args = obj["num_args"].int_value();
for (int i = 0; i < kk->num_args; i++) {
string arg = obj["args"].array_items()[i].string_value();
int arg_size = obj["args_size"].array_items()[i].int_value();
kk->args_size.push_back(arg_size);
if (arg_size == 8) {
cl_mem val = *(cl_mem*)(arg.data());
val = real_mem[val];
kk->args.push_back(string((char*)&val, sizeof(val)));
} else {
kk->args.push_back(arg);
}
}
kq.push_back(kk);
}
clFinish(command_queue);
}

View File

@@ -0,0 +1,133 @@
#pragma once
#ifndef __user
#define __user __attribute__(())
#endif
#include <cstdint>
#include <cstdlib>
#include <memory>
#include <string>
#include <vector>
#include <CL/cl.h>
#include "third_party/linux/include/msm_kgsl.h"
using namespace std;
cl_int thneed_clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value);
namespace json11 {
class Json;
}
class Thneed;
class GPUMalloc {
public:
GPUMalloc(int size, int fd);
~GPUMalloc();
void *alloc(int size);
private:
uint64_t base;
int remaining;
};
class CLQueuedKernel {
public:
CLQueuedKernel(Thneed *lthneed) { thneed = lthneed; }
CLQueuedKernel(Thneed *lthneed,
cl_kernel _kernel,
cl_uint _work_dim,
const size_t *_global_work_size,
const size_t *_local_work_size);
cl_int exec();
void debug_print(bool verbose);
int get_arg_num(const char *search_arg_name);
cl_program program;
string name;
cl_uint num_args;
vector<string> arg_names;
vector<string> arg_types;
vector<string> args;
vector<int> args_size;
cl_kernel kernel = NULL;
json11::Json to_json() const;
cl_uint work_dim;
size_t global_work_size[3] = {0};
size_t local_work_size[3] = {0};
private:
Thneed *thneed;
};
class CachedIoctl {
public:
virtual void exec() {}
};
class CachedSync: public CachedIoctl {
public:
CachedSync(Thneed *lthneed, string ldata) { thneed = lthneed; data = ldata; }
void exec();
private:
Thneed *thneed;
string data;
};
class CachedCommand: public CachedIoctl {
public:
CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd);
void exec();
private:
void disassemble(int cmd_index);
struct kgsl_gpu_command cache;
unique_ptr<kgsl_command_object[]> cmds;
unique_ptr<kgsl_command_object[]> objs;
Thneed *thneed;
vector<shared_ptr<CLQueuedKernel> > kq;
};
class Thneed {
public:
Thneed(bool do_clinit=false, cl_context _context = NULL);
void stop();
void execute(float **finputs, float *foutput, bool slow=false);
void wait();
vector<cl_mem> input_clmem;
vector<void *> inputs;
vector<size_t> input_sizes;
cl_mem output = NULL;
cl_context context = NULL;
cl_command_queue command_queue;
cl_device_id device_id;
int context_id;
// protected?
bool record = false;
int debug;
int timestamp;
#ifdef QCOM2
unique_ptr<GPUMalloc> ram;
vector<unique_ptr<CachedIoctl> > cmds;
int fd;
#endif
// all CL kernels
void copy_inputs(float **finputs, bool internal=false);
void copy_output(float *foutput);
cl_int clexec();
vector<shared_ptr<CLQueuedKernel> > kq;
// pending CL kernels
vector<shared_ptr<CLQueuedKernel> > ckq;
// loading
void load(const char *filename);
private:
void clinit();
};

View File

@@ -0,0 +1,216 @@
#include "sunnypilot/modeld/thneed/thneed.h"
#include <cassert>
#include <cstring>
#include <map>
#include "common/clutil.h"
#include "common/timing.h"
map<pair<cl_kernel, int>, string> g_args;
map<pair<cl_kernel, int>, int> g_args_size;
map<cl_program, string> g_program_source;
void Thneed::stop() {
//printf("Thneed::stop: recorded %lu commands\n", cmds.size());
record = false;
}
void Thneed::clinit() {
device_id = cl_get_device_id(CL_DEVICE_TYPE_DEFAULT);
if (context == NULL) context = CL_CHECK_ERR(clCreateContext(NULL, 1, &device_id, NULL, NULL, &err));
//cl_command_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
cl_command_queue_properties props[3] = {CL_QUEUE_PROPERTIES, 0, 0};
command_queue = CL_CHECK_ERR(clCreateCommandQueueWithProperties(context, device_id, props, &err));
printf("Thneed::clinit done\n");
}
cl_int Thneed::clexec() {
if (debug >= 1) printf("Thneed::clexec: running %lu queued kernels\n", kq.size());
for (auto &k : kq) {
if (record) ckq.push_back(k);
cl_int ret = k->exec();
assert(ret == CL_SUCCESS);
}
return clFinish(command_queue);
}
void Thneed::copy_inputs(float **finputs, bool internal) {
for (int idx = 0; idx < inputs.size(); ++idx) {
if (debug >= 1) printf("copying %lu -- %p -> %p (cl %p)\n", input_sizes[idx], finputs[idx], inputs[idx], input_clmem[idx]);
if (internal) {
// if it's internal, using memcpy is fine since the buffer sync is cached in the ioctl layer
if (finputs[idx] != NULL) memcpy(inputs[idx], finputs[idx], input_sizes[idx]);
} else {
if (finputs[idx] != NULL) CL_CHECK(clEnqueueWriteBuffer(command_queue, input_clmem[idx], CL_TRUE, 0, input_sizes[idx], finputs[idx], 0, NULL, NULL));
}
}
}
void Thneed::copy_output(float *foutput) {
if (output != NULL) {
size_t sz;
clGetMemObjectInfo(output, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
if (debug >= 1) printf("copying %lu for output %p -> %p\n", sz, output, foutput);
CL_CHECK(clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sz, foutput, 0, NULL, NULL));
} else {
printf("CAUTION: model output is NULL, does it have no outputs?\n");
}
}
// *********** CLQueuedKernel ***********
CLQueuedKernel::CLQueuedKernel(Thneed *lthneed,
cl_kernel _kernel,
cl_uint _work_dim,
const size_t *_global_work_size,
const size_t *_local_work_size) {
thneed = lthneed;
kernel = _kernel;
work_dim = _work_dim;
assert(work_dim <= 3);
for (int i = 0; i < work_dim; i++) {
global_work_size[i] = _global_work_size[i];
local_work_size[i] = _local_work_size[i];
}
char _name[0x100];
clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, sizeof(_name), _name, NULL);
name = string(_name);
clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS, sizeof(num_args), &num_args, NULL);
// get args
for (int i = 0; i < num_args; i++) {
char arg_name[0x100] = {0};
clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL);
arg_names.push_back(string(arg_name));
clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_name), arg_name, NULL);
arg_types.push_back(string(arg_name));
args.push_back(g_args[make_pair(kernel, i)]);
args_size.push_back(g_args_size[make_pair(kernel, i)]);
}
// get program
clGetKernelInfo(kernel, CL_KERNEL_PROGRAM, sizeof(program), &program, NULL);
}
int CLQueuedKernel::get_arg_num(const char *search_arg_name) {
for (int i = 0; i < num_args; i++) {
if (arg_names[i] == search_arg_name) return i;
}
printf("failed to find %s in %s\n", search_arg_name, name.c_str());
assert(false);
}
cl_int CLQueuedKernel::exec() {
if (kernel == NULL) {
kernel = clCreateKernel(program, name.c_str(), NULL);
arg_names.clear();
arg_types.clear();
for (int j = 0; j < num_args; j++) {
char arg_name[0x100] = {0};
clGetKernelArgInfo(kernel, j, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL);
arg_names.push_back(string(arg_name));
clGetKernelArgInfo(kernel, j, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_name), arg_name, NULL);
arg_types.push_back(string(arg_name));
cl_int ret;
if (args[j].size() != 0) {
assert(args[j].size() == args_size[j]);
ret = thneed_clSetKernelArg(kernel, j, args[j].size(), args[j].data());
} else {
ret = thneed_clSetKernelArg(kernel, j, args_size[j], NULL);
}
assert(ret == CL_SUCCESS);
}
}
if (thneed->debug >= 1) {
debug_print(thneed->debug >= 2);
}
return clEnqueueNDRangeKernel(thneed->command_queue,
kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL);
}
void CLQueuedKernel::debug_print(bool verbose) {
printf("%p %56s -- ", kernel, name.c_str());
for (int i = 0; i < work_dim; i++) {
printf("%4zu ", global_work_size[i]);
}
printf(" -- ");
for (int i = 0; i < work_dim; i++) {
printf("%4zu ", local_work_size[i]);
}
printf("\n");
if (verbose) {
for (int i = 0; i < num_args; i++) {
string arg = args[i];
printf(" %s %s", arg_types[i].c_str(), arg_names[i].c_str());
void *arg_value = (void*)arg.data();
int arg_size = arg.size();
if (arg_size == 0) {
printf(" (size) %d", args_size[i]);
} else if (arg_size == 1) {
printf(" = %d", *((char*)arg_value));
} else if (arg_size == 2) {
printf(" = %d", *((short*)arg_value));
} else if (arg_size == 4) {
if (arg_types[i] == "float") {
printf(" = %f", *((float*)arg_value));
} else {
printf(" = %d", *((int*)arg_value));
}
} else if (arg_size == 8) {
cl_mem val = (cl_mem)(*((uintptr_t*)arg_value));
printf(" = %p", val);
if (val != NULL) {
cl_mem_object_type obj_type;
clGetMemObjectInfo(val, CL_MEM_TYPE, sizeof(obj_type), &obj_type, NULL);
if (arg_types[i] == "image2d_t" || arg_types[i] == "image1d_t" || obj_type == CL_MEM_OBJECT_IMAGE2D) {
cl_image_format format;
size_t width, height, depth, array_size, row_pitch, slice_pitch;
cl_mem buf;
clGetImageInfo(val, CL_IMAGE_FORMAT, sizeof(format), &format, NULL);
assert(format.image_channel_order == CL_RGBA);
assert(format.image_channel_data_type == CL_HALF_FLOAT || format.image_channel_data_type == CL_FLOAT);
clGetImageInfo(val, CL_IMAGE_WIDTH, sizeof(width), &width, NULL);
clGetImageInfo(val, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
clGetImageInfo(val, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL);
clGetImageInfo(val, CL_IMAGE_DEPTH, sizeof(depth), &depth, NULL);
clGetImageInfo(val, CL_IMAGE_ARRAY_SIZE, sizeof(array_size), &array_size, NULL);
clGetImageInfo(val, CL_IMAGE_SLICE_PITCH, sizeof(slice_pitch), &slice_pitch, NULL);
assert(depth == 0);
assert(array_size == 0);
assert(slice_pitch == 0);
clGetImageInfo(val, CL_IMAGE_BUFFER, sizeof(buf), &buf, NULL);
size_t sz = 0;
if (buf != NULL) clGetMemObjectInfo(buf, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
printf(" image %zu x %zu rp %zu @ %p buffer %zu", width, height, row_pitch, buf, sz);
} else {
size_t sz;
clGetMemObjectInfo(val, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
printf(" buffer %zu", sz);
}
}
}
printf("\n");
}
}
}
cl_int thneed_clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) {
g_args_size[make_pair(kernel, arg_index)] = arg_size;
if (arg_value != NULL) {
g_args[make_pair(kernel, arg_index)] = string((char*)arg_value, arg_size);
} else {
g_args[make_pair(kernel, arg_index)] = string("");
}
cl_int ret = clSetKernelArg(kernel, arg_index, arg_size, arg_value);
return ret;
}

View File

@@ -0,0 +1,32 @@
#include "sunnypilot/modeld/thneed/thneed.h"
#include <cassert>
#include "common/clutil.h"
#include "common/timing.h"
Thneed::Thneed(bool do_clinit, cl_context _context) {
context = _context;
if (do_clinit) clinit();
char *thneed_debug_env = getenv("THNEED_DEBUG");
debug = (thneed_debug_env != NULL) ? atoi(thneed_debug_env) : 0;
}
void Thneed::execute(float **finputs, float *foutput, bool slow) {
uint64_t tb, te;
if (debug >= 1) tb = nanos_since_boot();
// ****** copy inputs
copy_inputs(finputs);
// ****** run commands
clexec();
// ****** copy outputs
copy_output(foutput);
if (debug >= 1) {
te = nanos_since_boot();
printf("model exec in %lu us\n", (te-tb)/1000);
}
}

View File

@@ -0,0 +1,258 @@
#include "sunnypilot/modeld/thneed/thneed.h"
#include <dlfcn.h>
#include <sys/mman.h>
#include <cassert>
#include <cerrno>
#include <cstring>
#include <map>
#include <string>
#include "common/clutil.h"
#include "common/timing.h"
Thneed *g_thneed = NULL;
int g_fd = -1;
void hexdump(uint8_t *d, int len) {
assert((len%4) == 0);
printf(" dumping %p len 0x%x\n", d, len);
for (int i = 0; i < len/4; i++) {
if (i != 0 && (i%0x10) == 0) printf("\n");
printf("%8x ", d[i]);
}
printf("\n");
}
// *********** ioctl interceptor ***********
extern "C" {
int (*my_ioctl)(int filedes, unsigned long request, void *argp) = NULL;
#undef ioctl
int ioctl(int filedes, unsigned long request, void *argp) {
request &= 0xFFFFFFFF; // needed on QCOM2
if (my_ioctl == NULL) my_ioctl = reinterpret_cast<decltype(my_ioctl)>(dlsym(RTLD_NEXT, "ioctl"));
Thneed *thneed = g_thneed;
// save the fd
if (request == IOCTL_KGSL_GPUOBJ_ALLOC) g_fd = filedes;
// note that this runs always, even without a thneed object
if (request == IOCTL_KGSL_DRAWCTXT_CREATE) {
struct kgsl_drawctxt_create *create = (struct kgsl_drawctxt_create *)argp;
create->flags &= ~KGSL_CONTEXT_PRIORITY_MASK;
create->flags |= 6 << KGSL_CONTEXT_PRIORITY_SHIFT; // priority from 1-15, 1 is max priority
printf("IOCTL_KGSL_DRAWCTXT_CREATE: creating context with flags 0x%x\n", create->flags);
}
if (thneed != NULL) {
if (request == IOCTL_KGSL_GPU_COMMAND) {
struct kgsl_gpu_command *cmd = (struct kgsl_gpu_command *)argp;
if (thneed->record) {
thneed->timestamp = cmd->timestamp;
thneed->context_id = cmd->context_id;
thneed->cmds.push_back(unique_ptr<CachedCommand>(new CachedCommand(thneed, cmd)));
}
if (thneed->debug >= 1) {
printf("IOCTL_KGSL_GPU_COMMAND(%2zu): flags: 0x%lx context_id: %u timestamp: %u numcmds: %d numobjs: %d\n",
thneed->cmds.size(),
cmd->flags,
cmd->context_id, cmd->timestamp, cmd->numcmds, cmd->numobjs);
}
} else if (request == IOCTL_KGSL_GPUOBJ_SYNC) {
struct kgsl_gpuobj_sync *cmd = (struct kgsl_gpuobj_sync *)argp;
struct kgsl_gpuobj_sync_obj *objs = (struct kgsl_gpuobj_sync_obj *)(cmd->objs);
if (thneed->debug >= 2) {
printf("IOCTL_KGSL_GPUOBJ_SYNC count:%d ", cmd->count);
for (int i = 0; i < cmd->count; i++) {
printf(" -- offset:0x%lx len:0x%lx id:%d op:%d ", objs[i].offset, objs[i].length, objs[i].id, objs[i].op);
}
printf("\n");
}
if (thneed->record) {
thneed->cmds.push_back(unique_ptr<CachedSync>(new
CachedSync(thneed, string((char *)objs, sizeof(struct kgsl_gpuobj_sync_obj)*cmd->count))));
}
} else if (request == IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID) {
struct kgsl_device_waittimestamp_ctxtid *cmd = (struct kgsl_device_waittimestamp_ctxtid *)argp;
if (thneed->debug >= 1) {
printf("IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID: context_id: %d timestamp: %d timeout: %d\n",
cmd->context_id, cmd->timestamp, cmd->timeout);
}
} else if (request == IOCTL_KGSL_SETPROPERTY) {
if (thneed->debug >= 1) {
struct kgsl_device_getproperty *prop = (struct kgsl_device_getproperty *)argp;
printf("IOCTL_KGSL_SETPROPERTY: 0x%x sizebytes:%zu\n", prop->type, prop->sizebytes);
if (thneed->debug >= 2) {
hexdump((uint8_t *)prop->value, prop->sizebytes);
if (prop->type == KGSL_PROP_PWR_CONSTRAINT) {
struct kgsl_device_constraint *constraint = (struct kgsl_device_constraint *)prop->value;
hexdump((uint8_t *)constraint->data, constraint->size);
}
}
}
} else if (request == IOCTL_KGSL_DRAWCTXT_CREATE || request == IOCTL_KGSL_DRAWCTXT_DESTROY) {
// this happens
} else if (request == IOCTL_KGSL_GPUOBJ_ALLOC || request == IOCTL_KGSL_GPUOBJ_FREE) {
// this happens
} else {
if (thneed->debug >= 1) {
printf("other ioctl %lx\n", request);
}
}
}
int ret = my_ioctl(filedes, request, argp);
// NOTE: This error message goes into stdout and messes up pyenv
// if (ret != 0) printf("ioctl returned %d with errno %d\n", ret, errno);
return ret;
}
}
// *********** GPUMalloc ***********
GPUMalloc::GPUMalloc(int size, int fd) {
struct kgsl_gpuobj_alloc alloc;
memset(&alloc, 0, sizeof(alloc));
alloc.size = size;
alloc.flags = 0x10000a00;
ioctl(fd, IOCTL_KGSL_GPUOBJ_ALLOC, &alloc);
void *addr = mmap64(NULL, alloc.mmapsize, 0x3, 0x1, fd, alloc.id*0x1000);
assert(addr != MAP_FAILED);
base = (uint64_t)addr;
remaining = size;
}
GPUMalloc::~GPUMalloc() {
// TODO: free the GPU malloced area
}
void *GPUMalloc::alloc(int size) {
void *ret = (void*)base;
size = (size+0xff) & (~0xFF);
assert(size <= remaining);
remaining -= size;
base += size;
return ret;
}
// *********** CachedSync, at the ioctl layer ***********
void CachedSync::exec() {
struct kgsl_gpuobj_sync cmd;
cmd.objs = (uint64_t)data.data();
cmd.obj_len = data.length();
cmd.count = data.length() / sizeof(struct kgsl_gpuobj_sync_obj);
int ret = ioctl(thneed->fd, IOCTL_KGSL_GPUOBJ_SYNC, &cmd);
assert(ret == 0);
}
// *********** CachedCommand, at the ioctl layer ***********
CachedCommand::CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd) {
thneed = lthneed;
assert(cmd->numsyncs == 0);
memcpy(&cache, cmd, sizeof(cache));
if (cmd->numcmds > 0) {
cmds = make_unique<struct kgsl_command_object[]>(cmd->numcmds);
memcpy(cmds.get(), (void *)cmd->cmdlist, sizeof(struct kgsl_command_object)*cmd->numcmds);
cache.cmdlist = (uint64_t)cmds.get();
for (int i = 0; i < cmd->numcmds; i++) {
void *nn = thneed->ram->alloc(cmds[i].size);
memcpy(nn, (void*)cmds[i].gpuaddr, cmds[i].size);
cmds[i].gpuaddr = (uint64_t)nn;
}
}
if (cmd->numobjs > 0) {
objs = make_unique<struct kgsl_command_object[]>(cmd->numobjs);
memcpy(objs.get(), (void *)cmd->objlist, sizeof(struct kgsl_command_object)*cmd->numobjs);
cache.objlist = (uint64_t)objs.get();
for (int i = 0; i < cmd->numobjs; i++) {
void *nn = thneed->ram->alloc(objs[i].size);
memset(nn, 0, objs[i].size);
objs[i].gpuaddr = (uint64_t)nn;
}
}
kq = thneed->ckq;
thneed->ckq.clear();
}
void CachedCommand::exec() {
cache.timestamp = ++thneed->timestamp;
int ret = ioctl(thneed->fd, IOCTL_KGSL_GPU_COMMAND, &cache);
if (thneed->debug >= 1) printf("CachedCommand::exec got %d\n", ret);
if (thneed->debug >= 2) {
for (auto &it : kq) {
it->debug_print(false);
}
}
assert(ret == 0);
}
// *********** Thneed ***********
Thneed::Thneed(bool do_clinit, cl_context _context) {
// TODO: QCOM2 actually requires a different context
//context = _context;
if (do_clinit) clinit();
assert(g_fd != -1);
fd = g_fd;
ram = make_unique<GPUMalloc>(0x80000, fd);
timestamp = -1;
g_thneed = this;
char *thneed_debug_env = getenv("THNEED_DEBUG");
debug = (thneed_debug_env != NULL) ? atoi(thneed_debug_env) : 0;
}
void Thneed::wait() {
struct kgsl_device_waittimestamp_ctxtid wait;
wait.context_id = context_id;
wait.timestamp = timestamp;
wait.timeout = -1;
uint64_t tb = nanos_since_boot();
int wret = ioctl(fd, IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID, &wait);
uint64_t te = nanos_since_boot();
if (debug >= 1) printf("wait %d after %lu us\n", wret, (te-tb)/1000);
}
void Thneed::execute(float **finputs, float *foutput, bool slow) {
uint64_t tb, te;
if (debug >= 1) tb = nanos_since_boot();
// ****** copy inputs
copy_inputs(finputs, true);
// ****** run commands
int i = 0;
for (auto &it : cmds) {
++i;
if (debug >= 1) printf("run %2d @ %7lu us: ", i, (nanos_since_boot()-tb)/1000);
it->exec();
if ((i == cmds.size()) || slow) wait();
}
// ****** copy outputs
copy_output(foutput);
if (debug >= 1) {
te = nanos_since_boot();
printf("model exec in %lu us\n", (te-tb)/1000);
}
}

View File

@@ -0,0 +1,76 @@
#include "selfdrive/modeld/transforms/loadyuv.h"
#include <cassert>
#include <cstdio>
#include <cstring>
void loadyuv_init(LoadYUVState* s, cl_context ctx, cl_device_id device_id, int width, int height) {
memset(s, 0, sizeof(*s));
s->width = width;
s->height = height;
char args[1024];
snprintf(args, sizeof(args),
"-cl-fast-relaxed-math -cl-denorms-are-zero "
"-DTRANSFORMED_WIDTH=%d -DTRANSFORMED_HEIGHT=%d",
width, height);
cl_program prg = cl_program_from_file(ctx, device_id, LOADYUV_PATH, args);
s->loadys_krnl = CL_CHECK_ERR(clCreateKernel(prg, "loadys", &err));
s->loaduv_krnl = CL_CHECK_ERR(clCreateKernel(prg, "loaduv", &err));
s->copy_krnl = CL_CHECK_ERR(clCreateKernel(prg, "copy", &err));
// done with this
CL_CHECK(clReleaseProgram(prg));
}
void loadyuv_destroy(LoadYUVState* s) {
CL_CHECK(clReleaseKernel(s->loadys_krnl));
CL_CHECK(clReleaseKernel(s->loaduv_krnl));
CL_CHECK(clReleaseKernel(s->copy_krnl));
}
void loadyuv_queue(LoadYUVState* s, cl_command_queue q,
cl_mem y_cl, cl_mem u_cl, cl_mem v_cl,
cl_mem out_cl) {
cl_int global_out_off = 0;
CL_CHECK(clSetKernelArg(s->loadys_krnl, 0, sizeof(cl_mem), &y_cl));
CL_CHECK(clSetKernelArg(s->loadys_krnl, 1, sizeof(cl_mem), &out_cl));
CL_CHECK(clSetKernelArg(s->loadys_krnl, 2, sizeof(cl_int), &global_out_off));
const size_t loadys_work_size = (s->width*s->height)/8;
CL_CHECK(clEnqueueNDRangeKernel(q, s->loadys_krnl, 1, NULL,
&loadys_work_size, NULL, 0, 0, NULL));
const size_t loaduv_work_size = ((s->width/2)*(s->height/2))/8;
global_out_off += (s->width*s->height);
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 0, sizeof(cl_mem), &u_cl));
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 1, sizeof(cl_mem), &out_cl));
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 2, sizeof(cl_int), &global_out_off));
CL_CHECK(clEnqueueNDRangeKernel(q, s->loaduv_krnl, 1, NULL,
&loaduv_work_size, NULL, 0, 0, NULL));
global_out_off += (s->width/2)*(s->height/2);
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 0, sizeof(cl_mem), &v_cl));
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 1, sizeof(cl_mem), &out_cl));
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 2, sizeof(cl_int), &global_out_off));
CL_CHECK(clEnqueueNDRangeKernel(q, s->loaduv_krnl, 1, NULL,
&loaduv_work_size, NULL, 0, 0, NULL));
}
void copy_queue(LoadYUVState* s, cl_command_queue q, cl_mem src, cl_mem dst,
size_t src_offset, size_t dst_offset, size_t size) {
CL_CHECK(clSetKernelArg(s->copy_krnl, 0, sizeof(cl_mem), &src));
CL_CHECK(clSetKernelArg(s->copy_krnl, 1, sizeof(cl_mem), &dst));
CL_CHECK(clSetKernelArg(s->copy_krnl, 2, sizeof(cl_int), &src_offset));
CL_CHECK(clSetKernelArg(s->copy_krnl, 3, sizeof(cl_int), &dst_offset));
const size_t copy_work_size = size/8;
CL_CHECK(clEnqueueNDRangeKernel(q, s->copy_krnl, 1, NULL,
&copy_work_size, NULL, 0, 0, NULL));
}

View File

@@ -0,0 +1,47 @@
#define UV_SIZE ((TRANSFORMED_WIDTH/2)*(TRANSFORMED_HEIGHT/2))
__kernel void loadys(__global uchar8 const * const Y,
__global uchar * out,
int out_offset)
{
const int gid = get_global_id(0);
const int ois = gid * 8;
const int oy = ois / TRANSFORMED_WIDTH;
const int ox = ois % TRANSFORMED_WIDTH;
const uchar8 ys = Y[gid];
// 02
// 13
__global uchar* outy0;
__global uchar* outy1;
if ((oy & 1) == 0) {
outy0 = out + out_offset; //y0
outy1 = out + out_offset + UV_SIZE*2; //y2
} else {
outy0 = out + out_offset + UV_SIZE; //y1
outy1 = out + out_offset + UV_SIZE*3; //y3
}
vstore4(ys.s0246, 0, outy0 + (oy/2) * (TRANSFORMED_WIDTH/2) + ox/2);
vstore4(ys.s1357, 0, outy1 + (oy/2) * (TRANSFORMED_WIDTH/2) + ox/2);
}
__kernel void loaduv(__global uchar8 const * const in,
__global uchar8 * out,
int out_offset)
{
const int gid = get_global_id(0);
const uchar8 inv = in[gid];
out[gid + out_offset / 8] = inv;
}
__kernel void copy(__global uchar8 * in,
__global uchar8 * out,
int in_offset,
int out_offset)
{
const int gid = get_global_id(0);
out[gid + out_offset / 8] = in[gid + in_offset / 8];
}

View File

@@ -0,0 +1,20 @@
#pragma once
#include "common/clutil.h"
typedef struct {
int width, height;
cl_kernel loadys_krnl, loaduv_krnl, copy_krnl;
} LoadYUVState;
void loadyuv_init(LoadYUVState* s, cl_context ctx, cl_device_id device_id, int width, int height);
void loadyuv_destroy(LoadYUVState* s);
void loadyuv_queue(LoadYUVState* s, cl_command_queue q,
cl_mem y_cl, cl_mem u_cl, cl_mem v_cl,
cl_mem out_cl);
void copy_queue(LoadYUVState* s, cl_command_queue q, cl_mem src, cl_mem dst,
size_t src_offset, size_t dst_offset, size_t size);

View File

@@ -0,0 +1,97 @@
#include "selfdrive/modeld/transforms/transform.h"
#include <cassert>
#include <cstring>
#include "common/clutil.h"
void transform_init(Transform* s, cl_context ctx, cl_device_id device_id) {
memset(s, 0, sizeof(*s));
cl_program prg = cl_program_from_file(ctx, device_id, TRANSFORM_PATH, "");
s->krnl = CL_CHECK_ERR(clCreateKernel(prg, "warpPerspective", &err));
// done with this
CL_CHECK(clReleaseProgram(prg));
s->m_y_cl = CL_CHECK_ERR(clCreateBuffer(ctx, CL_MEM_READ_WRITE, 3*3*sizeof(float), NULL, &err));
s->m_uv_cl = CL_CHECK_ERR(clCreateBuffer(ctx, CL_MEM_READ_WRITE, 3*3*sizeof(float), NULL, &err));
}
void transform_destroy(Transform* s) {
CL_CHECK(clReleaseMemObject(s->m_y_cl));
CL_CHECK(clReleaseMemObject(s->m_uv_cl));
CL_CHECK(clReleaseKernel(s->krnl));
}
void transform_queue(Transform* s,
cl_command_queue q,
cl_mem in_yuv, int in_width, int in_height, int in_stride, int in_uv_offset,
cl_mem out_y, cl_mem out_u, cl_mem out_v,
int out_width, int out_height,
const mat3& projection) {
const int zero = 0;
// sampled using pixel center origin
// (because that's how fastcv and opencv does it)
mat3 projection_y = projection;
// in and out uv is half the size of y.
mat3 projection_uv = transform_scale_buffer(projection, 0.5);
CL_CHECK(clEnqueueWriteBuffer(q, s->m_y_cl, CL_TRUE, 0, 3*3*sizeof(float), (void*)projection_y.v, 0, NULL, NULL));
CL_CHECK(clEnqueueWriteBuffer(q, s->m_uv_cl, CL_TRUE, 0, 3*3*sizeof(float), (void*)projection_uv.v, 0, NULL, NULL));
const int in_y_width = in_width;
const int in_y_height = in_height;
const int in_y_px_stride = 1;
const int in_uv_width = in_width/2;
const int in_uv_height = in_height/2;
const int in_uv_px_stride = 2;
const int in_u_offset = in_uv_offset;
const int in_v_offset = in_uv_offset + 1;
const int out_y_width = out_width;
const int out_y_height = out_height;
const int out_uv_width = out_width/2;
const int out_uv_height = out_height/2;
CL_CHECK(clSetKernelArg(s->krnl, 0, sizeof(cl_mem), &in_yuv)); // src
CL_CHECK(clSetKernelArg(s->krnl, 1, sizeof(cl_int), &in_stride)); // src_row_stride
CL_CHECK(clSetKernelArg(s->krnl, 2, sizeof(cl_int), &in_y_px_stride)); // src_px_stride
CL_CHECK(clSetKernelArg(s->krnl, 3, sizeof(cl_int), &zero)); // src_offset
CL_CHECK(clSetKernelArg(s->krnl, 4, sizeof(cl_int), &in_y_height)); // src_rows
CL_CHECK(clSetKernelArg(s->krnl, 5, sizeof(cl_int), &in_y_width)); // src_cols
CL_CHECK(clSetKernelArg(s->krnl, 6, sizeof(cl_mem), &out_y)); // dst
CL_CHECK(clSetKernelArg(s->krnl, 7, sizeof(cl_int), &out_y_width)); // dst_row_stride
CL_CHECK(clSetKernelArg(s->krnl, 8, sizeof(cl_int), &zero)); // dst_offset
CL_CHECK(clSetKernelArg(s->krnl, 9, sizeof(cl_int), &out_y_height)); // dst_rows
CL_CHECK(clSetKernelArg(s->krnl, 10, sizeof(cl_int), &out_y_width)); // dst_cols
CL_CHECK(clSetKernelArg(s->krnl, 11, sizeof(cl_mem), &s->m_y_cl)); // M
const size_t work_size_y[2] = {(size_t)out_y_width, (size_t)out_y_height};
CL_CHECK(clEnqueueNDRangeKernel(q, s->krnl, 2, NULL,
(const size_t*)&work_size_y, NULL, 0, 0, NULL));
const size_t work_size_uv[2] = {(size_t)out_uv_width, (size_t)out_uv_height};
CL_CHECK(clSetKernelArg(s->krnl, 2, sizeof(cl_int), &in_uv_px_stride)); // src_px_stride
CL_CHECK(clSetKernelArg(s->krnl, 3, sizeof(cl_int), &in_u_offset)); // src_offset
CL_CHECK(clSetKernelArg(s->krnl, 4, sizeof(cl_int), &in_uv_height)); // src_rows
CL_CHECK(clSetKernelArg(s->krnl, 5, sizeof(cl_int), &in_uv_width)); // src_cols
CL_CHECK(clSetKernelArg(s->krnl, 6, sizeof(cl_mem), &out_u)); // dst
CL_CHECK(clSetKernelArg(s->krnl, 7, sizeof(cl_int), &out_uv_width)); // dst_row_stride
CL_CHECK(clSetKernelArg(s->krnl, 8, sizeof(cl_int), &zero)); // dst_offset
CL_CHECK(clSetKernelArg(s->krnl, 9, sizeof(cl_int), &out_uv_height)); // dst_rows
CL_CHECK(clSetKernelArg(s->krnl, 10, sizeof(cl_int), &out_uv_width)); // dst_cols
CL_CHECK(clSetKernelArg(s->krnl, 11, sizeof(cl_mem), &s->m_uv_cl)); // M
CL_CHECK(clEnqueueNDRangeKernel(q, s->krnl, 2, NULL,
(const size_t*)&work_size_uv, NULL, 0, 0, NULL));
CL_CHECK(clSetKernelArg(s->krnl, 3, sizeof(cl_int), &in_v_offset)); // src_ofset
CL_CHECK(clSetKernelArg(s->krnl, 6, sizeof(cl_mem), &out_v)); // dst
CL_CHECK(clEnqueueNDRangeKernel(q, s->krnl, 2, NULL,
(const size_t*)&work_size_uv, NULL, 0, 0, NULL));
}

View File

@@ -0,0 +1,54 @@
#define INTER_BITS 5
#define INTER_TAB_SIZE (1 << INTER_BITS)
#define INTER_SCALE 1.f / INTER_TAB_SIZE
#define INTER_REMAP_COEF_BITS 15
#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS)
__kernel void warpPerspective(__global const uchar * src,
int src_row_stride, int src_px_stride, int src_offset, int src_rows, int src_cols,
__global uchar * dst,
int dst_row_stride, int dst_offset, int dst_rows, int dst_cols,
__constant float * M)
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if (dx < dst_cols && dy < dst_rows)
{
float X0 = M[0] * dx + M[1] * dy + M[2];
float Y0 = M[3] * dx + M[4] * dy + M[5];
float W = M[6] * dx + M[7] * dy + M[8];
W = W != 0.0f ? INTER_TAB_SIZE / W : 0.0f;
int X = rint(X0 * W), Y = rint(Y0 * W);
int sx = convert_short_sat(X >> INTER_BITS);
int sy = convert_short_sat(Y >> INTER_BITS);
short sx_clamp = clamp(sx, 0, src_cols - 1);
short sx_p1_clamp = clamp(sx + 1, 0, src_cols - 1);
short sy_clamp = clamp(sy, 0, src_rows - 1);
short sy_p1_clamp = clamp(sy + 1, 0, src_rows - 1);
int v0 = convert_int(src[mad24(sy_clamp, src_row_stride, src_offset + sx_clamp*src_px_stride)]);
int v1 = convert_int(src[mad24(sy_clamp, src_row_stride, src_offset + sx_p1_clamp*src_px_stride)]);
int v2 = convert_int(src[mad24(sy_p1_clamp, src_row_stride, src_offset + sx_clamp*src_px_stride)]);
int v3 = convert_int(src[mad24(sy_p1_clamp, src_row_stride, src_offset + sx_p1_clamp*src_px_stride)]);
short ay = (short)(Y & (INTER_TAB_SIZE - 1));
short ax = (short)(X & (INTER_TAB_SIZE - 1));
float taby = 1.f/INTER_TAB_SIZE*ay;
float tabx = 1.f/INTER_TAB_SIZE*ax;
int dst_index = mad24(dy, dst_row_stride, dst_offset + dx);
int itab0 = convert_short_sat_rte( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );
int itab1 = convert_short_sat_rte( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE );
int itab2 = convert_short_sat_rte( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );
int itab3 = convert_short_sat_rte( taby*tabx * INTER_REMAP_COEF_SCALE );
int val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3;
uchar pix = convert_uchar_sat((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS);
dst[dst_index] = pix;
}
}

View File

@@ -0,0 +1,25 @@
#pragma once
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#include "common/mat.h"
typedef struct {
cl_kernel krnl;
cl_mem m_y_cl, m_uv_cl;
} Transform;
void transform_init(Transform* s, cl_context ctx, cl_device_id device_id);
void transform_destroy(Transform* transform);
void transform_queue(Transform* s, cl_command_queue q,
cl_mem yuv, int in_width, int in_height, int in_stride, int in_uv_offset,
cl_mem out_y, cl_mem out_u, cl_mem out_v,
int out_width, int out_height,
const mat3& projection);

View File

@@ -21,6 +21,7 @@ async def verify_file(file_path: str, expected_hash: str) -> bool:
return sha256_hash.hexdigest().lower() == expected_hash.lower()
def get_active_bundle(params: Params) -> custom.ModelManagerSP.ModelBundle:
"""Gets the active model bundle from cache"""
if params is None:
@@ -30,3 +31,23 @@ def get_active_bundle(params: Params) -> custom.ModelManagerSP.ModelBundle:
return messaging.log_from_bytes(active_bundle, custom.ModelManagerSP.ModelBundle)
return None
def get_model_runner_by_filename(filename: str) -> custom.ModelManagerSP.Runner:
if filename.endswith(".thneed"):
return custom.ModelManagerSP.Runner.snpe
if filename.endswith("_tinygrad.pkl"):
return custom.ModelManagerSP.Runner.tinygrad
def get_active_model_runner(params: Params) -> custom.ModelManagerSP.Runner:
"""Gets the model runner from the active model bundle. If no active bundle, returns tinygrad"""
if params is None:
params = Params()
if active_bundle := get_active_bundle(params):
drive_model = next(model for model in active_bundle.models if model.type == custom.ModelManagerSP.Type.drive)
return get_model_runner_by_filename(drive_model.fileName)
return custom.ModelManagerSP.Runner.tinygrad

View File

@@ -1,10 +1,12 @@
import os
import operator
from cereal import car
from cereal import car, custom
from openpilot.common.params import Params
from openpilot.system.hardware import PC, TICI
from openpilot.system.manager.process import PythonProcess, NativeProcess, DaemonProcess
from sunnypilot.models.helpers import get_active_model_runner
from sunnypilot.sunnylink.utils import sunnylink_need_register, sunnylink_ready, use_sunnylink_uploader
WEBCAM = os.getenv("USE_WEBCAM") is not None
@@ -70,6 +72,15 @@ def use_sunnylink_uploader_shim(started, params, CP: car.CarParams) -> bool:
"""Shim for use_sunnylink_uploader to match the process manager signature."""
return use_sunnylink_uploader(params)
def is_snpe_model(started, params, CP: car.CarParams) -> bool:
"""Check if the active model runner is SNPE."""
# TODO-SP: I want to do a little more optimization here to only check this once when we've transitioned from offroad to onroad.
return bool(get_active_model_runner(params) == custom.ModelManagerSP.Runner.snpe)
def is_stock_model(started, params, CP: car.CarParams) -> bool:
"""Check if the active model runner is stock."""
return not is_snpe_model(started, params, CP)
def or_(*fns):
return lambda *args: operator.or_(*(fn(*args) for fn in fns))
@@ -92,7 +103,7 @@ procs = [
NativeProcess("stream_encoderd", "system/loggerd", ["./encoderd", "--stream"], notcar),
NativeProcess("loggerd", "system/loggerd", ["./loggerd"], logging),
# TODO Make python process once TG allows opening QCOM from child proc
NativeProcess("modeld", "selfdrive/modeld", ["./modeld"], only_onroad),
NativeProcess("modeld", "selfdrive/modeld", ["./modeld"], and_(only_onroad, is_stock_model)),
NativeProcess("sensord", "system/sensord", ["./sensord"], only_onroad, enabled=not PC),
NativeProcess("ui", "selfdrive/ui", ["./ui"], always_run, watchdog_max_dt=(5 if not PC else None)),
PythonProcess("soundd", "selfdrive.ui.soundd", only_onroad),
@@ -134,6 +145,7 @@ procs = [
# sunnypilot
procs += [
PythonProcess("models_manager", "sunnypilot.models.manager", only_offroad),
NativeProcess("modeld_snpe", "sunnypilot/modeld", ["./modeld"], and_(only_onroad, is_snpe_model)),
]
if os.path.exists("./github_runner.sh"):