Compare commits

...

8 Commits

Author SHA1 Message Date
DevTekVE
82d67bba87 Very aggressive performance boost for replay on device 2024-12-29 09:18:11 +01:00
DevTekVE
341b92176e Fix inconsistent indentation in numpy_inputs initialization
Corrected the indentation within the loop for numpy_inputs assignment in `modeld.py`. This ensures proper execution of the loop and prevents potential runtime issues caused by misaligned code blocks.

Refactor TINYGRAD usage logic and simplify checks.

Consolidated the TINYGRAD flag with TICI conditions to reduce redundancy. Adjusted tensor initialization flow to handle different device setups more cleanly. This simplifies the code and improves maintainability.

Enable tinygrad integration via environment variable

Added support for using tinygrad on non-TICI devices by introducing the `USE_TINYGRAD` environment variable. Conditional logic was updated to accommodate this change, ensuring compatibility with both tinygrad and ONNX runtimes. This allows more flexibility in choosing the computation framework.
2024-12-28 20:43:28 +01:00
DevTekVE
d9c22271d6 Refactor input handling and add support for curvature outputs
Improved ONNX model input metadata handling and typecasting for numpy inputs. Added support for desired curvature outputs and lateral control parameters in the model data flow. Updated input preparation logic to enhance flexibility and maintain compatibility with dynamic inputs.
2024-12-28 17:44:06 +01:00
DevTekVE
9dc961ab0a Fix tensor input initialization with correct dtype
Updated tensor initialization to fetch the correct dtype from the model's expected variables. This ensures compatibility with varying input data types and avoids potential runtime errors.
2024-12-28 13:04:47 +01:00
DevTekVE
42d9c14515 Fix numpy_inputs initialization for non-OpenCL-managed keys
Previously, numpy_inputs were always initialized regardless of OpenCL management. The update ensures numpy_inputs are created only for keys not handled by OpenCL, avoiding unnecessary initialization.

make it stock
2024-12-28 12:06:53 +01:00
DevTekVE
35fbeaf9e2 Update tensor input initialization for TICI models
Refactored tensor input setup to dynamically adapt to model expectations, including dtype and device alignment based on captured attributes. This ensures compatibility and correct processing for TICI-based models.
2024-12-28 01:50:59 +01:00
DevTekVE
e23e078c5b Refactor model input indexing for clarity and efficiency.
Replaced hardcoded indexing with precomputed indices for feature buffers and desire reshape dimensions. This improves code readability, reduces redundancy, and ensures consistent array slicing throughout the model pipeline.
2024-12-27 18:42:56 +01:00
DevTekVE
deaf0c485c Get the model input and prepare the numpy array from the metadata 2024-12-27 17:07:11 +01:00
2 changed files with 68 additions and 18 deletions

View File

@@ -3,13 +3,14 @@ import os
from openpilot.system.hardware import TICI
#
if TICI:
USE_TINYGRAD = os.getenv('USE_TINYGRAD', True) or TICI
if USE_TINYGRAD:
from tinygrad.tensor import Tensor
from tinygrad.dtype import dtypes
from openpilot.selfdrive.modeld.runners.tinygrad_helpers import qcom_tensor_from_opencl_address
os.environ['QCOM'] = '1'
else:
from openpilot.selfdrive.modeld.runners.ort_helpers import make_onnx_cpu_runner
from openpilot.selfdrive.modeld.runners.ort_helpers import make_onnx_cpu_runner, ORT_TYPES_TO_NP_TYPES
import time
import pickle
import numpy as np
@@ -60,30 +61,36 @@ class ModelState:
self.frames = {'input_imgs': DrivingModelFrame(context), 'big_input_imgs': DrivingModelFrame(context)}
self.prev_desire = np.zeros(ModelConstants.DESIRE_LEN, dtype=np.float32)
self.full_features_20Hz = np.zeros((ModelConstants.FULL_HISTORY_BUFFER_LEN, ModelConstants.FEATURE_LEN), dtype=np.float32)
self.desire_20Hz = np.zeros((ModelConstants.FULL_HISTORY_BUFFER_LEN + 1, ModelConstants.DESIRE_LEN), dtype=np.float32)
self.desire_20Hz = np.zeros((ModelConstants.FULL_HISTORY_BUFFER_LEN + 1, ModelConstants.DESIRE_LEN), dtype=np.float32)
# img buffers are managed in openCL transform code
self.numpy_inputs = {
'desire': np.zeros((1, (ModelConstants.HISTORY_BUFFER_LEN+1), ModelConstants.DESIRE_LEN), dtype=np.float32),
'traffic_convention': np.zeros((1, ModelConstants.TRAFFIC_CONVENTION_LEN), dtype=np.float32),
'features_buffer': np.zeros((1, ModelConstants.HISTORY_BUFFER_LEN, ModelConstants.FEATURE_LEN), dtype=np.float32),
}
self.numpy_inputs = {}
with open(METADATA_PATH, 'rb') as f:
model_metadata = pickle.load(f)
self.input_shapes = model_metadata['input_shapes']
for key, shape in self.input_shapes.items():
if key not in self.frames: # Managed by opencl
self.numpy_inputs[key] = np.zeros(shape, dtype=np.float32)
self.output_slices = model_metadata['output_slices']
net_output_size = model_metadata['output_shapes']['outputs'][1]
self.output = np.zeros(net_output_size, dtype=np.float32)
self.parser = Parser()
if TICI:
if USE_TINYGRAD:
self.tensor_inputs = {k: Tensor(v, device='NPY').realize() for k,v in self.numpy_inputs.items()}
with open(MODEL_PKL_PATH, "rb") as f:
self.model_run = pickle.load(f)
else:
self.onnx_cpu_runner = make_onnx_cpu_runner(MODEL_PATH)
self.onnx_model_metadata = {input.name: input.type for input in self.onnx_cpu_runner.get_inputs()}
num_elements = self.numpy_inputs['features_buffer'].shape[1]
step_size = int(-100 / num_elements)
self.full_features_20Hz_idxs = np.arange(step_size, step_size * (num_elements + 1), step_size)[::-1]
self.desire_reshape_dims = (self.numpy_inputs['desire'].shape[0], self.numpy_inputs['desire'].shape[1], -1, self.numpy_inputs['desire'].shape[2])
def slice_outputs(self, model_outputs: np.ndarray) -> dict[str, np.ndarray]:
parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in self.output_slices.items()}
@@ -100,25 +107,32 @@ class ModelState:
self.desire_20Hz[:-1] = self.desire_20Hz[1:]
self.desire_20Hz[-1] = new_desire
self.numpy_inputs['desire'][:] = self.desire_20Hz.reshape((1,25,4,-1)).max(axis=2)
self.numpy_inputs['desire'][:] = self.desire_20Hz.reshape(self.desire_reshape_dims).max(axis=2)
self.numpy_inputs['traffic_convention'][:] = inputs['traffic_convention']
imgs_cl = {'input_imgs': self.frames['input_imgs'].prepare(buf, transform.flatten()),
'big_input_imgs': self.frames['big_input_imgs'].prepare(wbuf, transform_wide.flatten())}
if TICI:
if USE_TINYGRAD:
# The imgs tensors are backed by opencl memory, only need init once
for key in imgs_cl:
if key not in self.tensor_inputs:
self.tensor_inputs[key] = qcom_tensor_from_opencl_address(imgs_cl[key].mem_address, self.input_shapes[key], dtype=dtypes.uint8)
if not TICI or key not in self.tensor_inputs:
index = self.model_run.captured.expected_names.index(key)
_, _, dtype, device = self.model_run.captured.expected_st_vars_dtype_device[index]
if TICI:
self.tensor_inputs[key] = qcom_tensor_from_opencl_address(imgs_cl[key].mem_address, self.input_shapes[key], dtype=dtype)
else:
shape = self.frames[key].buffer_from_cl(imgs_cl[key]).reshape(self.input_shapes[key])
self.tensor_inputs[key] = Tensor(shape, device=device, dtype=dtype).realize()
else:
for key in imgs_cl:
self.numpy_inputs[key] = self.frames[key].buffer_from_cl(imgs_cl[key]).reshape(self.input_shapes[key])
dtype = self.onnx_model_metadata[key]
self.numpy_inputs[key] = self.frames[key].buffer_from_cl(imgs_cl[key]).astype(ORT_TYPES_TO_NP_TYPES[dtype]).reshape(self.input_shapes[key])
if prepare_only:
return None
if TICI:
if USE_TINYGRAD:
self.output = self.model_run(**self.tensor_inputs).numpy().flatten()
else:
self.output = self.onnx_cpu_runner.run(None, self.numpy_inputs)[0].flatten()
@@ -128,8 +142,15 @@ class ModelState:
self.full_features_20Hz[:-1] = self.full_features_20Hz[1:]
self.full_features_20Hz[-1] = outputs['hidden_state'][0, :]
idxs = np.arange(-4,-100,-4)[::-1]
self.numpy_inputs['features_buffer'][:] = self.full_features_20Hz[idxs]
self.numpy_inputs['features_buffer'][:] = self.full_features_20Hz[self.full_features_20Hz_idxs]
if "desired_curvature" in outputs:
if "prev_desired_curvs" in self.numpy_inputs.keys():
self.numpy_inputs['prev_desired_curvs'][:-1] = self.numpy_inputs['prev_desired_curvs'][1:]
self.numpy_inputs['prev_desired_curvs'][-1] = outputs['desired_curvature'][:, 0:1, None] # Reshape to (1,1,1)
if "prev_desired_curv" in self.numpy_inputs.keys():
# First shift everything
self.numpy_inputs['prev_desired_curv'][:-ModelConstants.PREV_DESIRED_CURV_LEN] = self.numpy_inputs['prev_desired_curv'][ModelConstants.PREV_DESIRED_CURV_LEN:]
self.numpy_inputs['prev_desired_curv'][-ModelConstants.PREV_DESIRED_CURV_LEN:] = outputs['desired_curvature'][:, :1].reshape(1, -1, 1)
return outputs
@@ -240,6 +261,10 @@ def main(demo=False):
is_rhd = sm["driverMonitoringState"].isRHD
frame_id = sm["roadCameraState"].frameId
v_ego = max(sm["carState"].vEgo, 0.)
lateral_control_params = None #TODO-SP: hardcoded, this shouldn't be here this way. We should do it more dynamically
if "lateral_control_params" in model.numpy_inputs.keys(): #TODO-SP: hardcoded, this shouldn't be here this way. We should do it more dynamically
lateral_control_params = np.array([sm["carState"].vEgo, steer_delay], dtype=np.float32)
if sm.updated["liveCalibration"] and sm.seen['roadCameraState'] and sm.seen['deviceState']:
device_from_calib_euler = np.array(sm["liveCalibration"].rpyCalib, dtype=np.float32)
dc = DEVICE_CAMERAS[(str(sm['deviceState'].deviceType), str(sm['roadCameraState'].sensor))]
@@ -271,6 +296,8 @@ def main(demo=False):
'desire': vec_desire,
'traffic_convention': traffic_convention,
}
if "lateral_control_params" in model.numpy_inputs.keys():
inputs['lateral_control_params'] = lateral_control_params
mt1 = time.perf_counter()
model_output = model.run(buf_main, buf_extra, model_transform_main, model_transform_extra, inputs, prepare_only)

View File

@@ -119,7 +119,13 @@ VideoDecoder::~VideoDecoder() {
}
bool VideoDecoder::open(AVCodecParameters *codecpar, bool hw_decoder) {
const AVCodec *decoder = avcodec_find_decoder(codecpar->codec_id);
const AVCodec *decoder = avcodec_find_decoder_by_name("h264_mediacodec");
if (!decoder) {
decoder = avcodec_find_decoder_by_name("h264_qcom");
}
if (!decoder) {
decoder = avcodec_find_decoder(codecpar->codec_id);
}
if (!decoder) return false;
decoder_ctx = avcodec_alloc_context3(decoder);
@@ -127,6 +133,23 @@ bool VideoDecoder::open(AVCodecParameters *codecpar, bool hw_decoder) {
rError("Failed to allocate or initialize codec context");
return false;
}
// More aggressive settings focused on reducing lag
decoder_ctx->thread_count = static_cast<int>(std::min(std::thread::hardware_concurrency(), 16u));
decoder_ctx->thread_type = FF_THREAD_FRAME | FF_THREAD_SLICE;
// Very aggressive frame dropping
decoder_ctx->flags |= AV_CODEC_FLAG_LOW_DELAY;
decoder_ctx->flags2 |= AV_CODEC_FLAG2_FAST;
decoder_ctx->skip_frame = AVDISCARD_BIDIR; // More aggressive frame skipping
decoder_ctx->skip_loop_filter = AVDISCARD_ALL;
decoder_ctx->workaround_bugs = FF_BUG_AUTODETECT;
// Minimize buffering
decoder_ctx->max_b_frames = 0;
decoder_ctx->strict_std_compliance = FF_COMPLIANCE_UNOFFICIAL; // Allow faster non-standard optimizations
decoder_ctx->flags |= AV_CODEC_FLAG_OUTPUT_CORRUPT; // Output frames even if slightly corrupted
width = (decoder_ctx->width + 3) & ~3;
height = decoder_ctx->height;