From cbc88b461d165b825c3fb67b4c4a17eef8dd904a Mon Sep 17 00:00:00 2001 From: Calvin Park Date: Wed, 15 Apr 2026 20:29:22 -0700 Subject: [PATCH] Better flash --- CLAUDE.md | 17 +++++++++ tsk/common/extractor.py | 85 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 99 insertions(+), 3 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index ca47a7fe..e5299df2 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -79,3 +79,20 @@ Current status: **fixed and verified end-to-end in car**. ### 2026-04-10 Two users reported successful key extraction using `calvinpark/tskm` — fix confirmed in the wild. Plan: squash the tskm branch to a single commit and push to `optskug/tskm` to ship it. + +### 2026-04-15 +Third user hit the same `can_version=0, library v1974202998` error after the 2026-04-09 fix shipped to `optskug/tskm`. Root-cause investigation revealed a harder failure mode. + +**What happened**: The user's device had `DEV-18392c3e-RELEASE` firmware. `panda.flash()` ran without error — but the firmware string and `up_to_date: False` were identical before and after. The SPI bulk writes were accepted without error but didn't persist. Immediately re-checking showed the same DEV firmware. + +**Why bare `panda.flash()` worked before but not here**: Previous users' firmware was overwritable via SPI. This device's DEV firmware resists SPI writes — possibly flash write protection, a hardware difference, or something specific to DEV builds. We can't prove it from the outside. What we know: the same scenario is exactly what pandad handles with its GPIO recovery path. + +**How the user was unblocked**: Installed `commaai/nightly-dev` on the device, which let pandad run and flash panda successfully via its full recovery sequence (GPIO BOOT0 HIGH → hardware DFU mode). Then ran TSKM from bare clone. This confirmed pandad's GPIO path could flash the device when bare `panda.flash()` could not. 
+ +**Fix**: Replaced `panda = Panda(); panda.flash()` with `flash_panda()` from `selfdrive/pandad/pandad.py`. pandad's sequence adds: (1) `HARDWARE.recover_internal_panda()` on `PandaProtocolMismatch` (asserts BOOT0 HIGH during reset, forcing STM32 into hardware DFU mode), (2) `panda.recover()` (flashes bootstub via USB DFU), then reflashes the main app. The GPIO path bypasses whatever blocks SPI writes on resistant firmware. + +**Code changes** (`tsk/common/extractor.py`): +- Added `class PandaError(Exception)` — hardware precondition failure, not retryable (falls through to "Unexpected error" handler, not `RetryError`) +- Extracted flash logic to `TSKExtractor._connect_and_flash_panda()` staticmethod with full historical commentary (background, original fix, new failure, hypothesis, fix rationale, source attribution) +- `hack()` call site: `panda = cls._connect_and_flash_panda()` +- New imports: `from panda import PandaProtocolMismatch`, `from selfdrive.pandad.pandad import flash_panda` diff --git a/tsk/common/extractor.py b/tsk/common/extractor.py index 2f3f153c..2e8a59fb 100644 --- a/tsk/common/extractor.py +++ b/tsk/common/extractor.py @@ -10,7 +10,8 @@ from opendbc.car.isotp import isotp_send from opendbc.car.structs import CarParams from opendbc.car.uds import UdsClient, ACCESS_TYPE, SESSION_TYPE, DATA_IDENTIFIER_TYPE, SERVICE_TYPE, \ ROUTINE_CONTROL_TYPE, InvalidServiceIdError, MessageTimeoutError, NegativeResponseError -from panda import Panda +from panda import Panda, PandaProtocolMismatch +from selfdrive.pandad.pandad import flash_panda from tsk.common.env import is_agnos, PAYLOAD_PATH @@ -31,6 +32,10 @@ class RetryError(Exception): return f"{self.message}\n\nTry again. If the problem persists, turn off the car, put it back into 'Not Ready to Drive' mode, and then try again." 
+class PandaError(Exception): + pass + + def format_version_for_error_display(version1, version2=None, length=8): version_str = "" @@ -73,6 +78,81 @@ class TSKExtractor: SECOC_KEY_SIZE = 0x10 SECOC_KEY_OFFSET = 0x0c + @staticmethod + def _connect_and_flash_panda() -> Panda: + """ + Connects to the panda and ensures its firmware is up to date before the extractor runs. + + ## Background + + TSKM never starts pandad — launch_chffrplus.sh runs tsk/main.py directly, bypassing + manager.py entirely. In normal openpilot, manager.py starts pandad on every boot, and + pandad's first job is to flash the panda firmware if it's out of date. Since TSKM skips + that whole stack, the panda arrives at the extractor with whatever firmware it had before. + + ## The Original Fix (April 2026) + + Users were hitting: RuntimeError: CAN packet version mismatch: panda's firmware v0, + library v1974202998. + + The panda library checks can_version (read from the panda's 0xdd endpoint) against + CAN_PACKET_VERSION (computed from can.h at runtime) before allowing any CAN send. + Firmware that predates the versioning scheme returns 0 bytes for 0xdd, so can_version=0. + + Fix: call panda.flash() before the extractor runs. panda.flash() compares the firmware + binary signature and reflashes if it doesn't match. This worked for all users at the time. + + ## The New Failure (April 2026, same month) + + A user with firmware "DEV-18392c3e-RELEASE" reported the same can_version=0 error. When + we ran panda.flash() manually via SSH and then immediately re-checked, the firmware was + STILL "DEV-18392c3e-RELEASE" and up_to_date was STILL False. The flash appeared to run + (flash: unlocking → erasing → flashing → resetting) but nothing changed. + + ## Hypothesis + + panda.flash() sends flash data via SPI bulk write. On the working devices, the firmware + being replaced accepted those writes and the new firmware booted cleanly. 
On this device, + the SPI writes are accepted without error but don't persist — the panda comes back up on + the same DEV firmware after every reset. We don't know exactly why (flash write protection, + a hardware difference in this unit, something specific to DEV builds). What we do know is + that this is exactly the scenario pandad handles with its GPIO recovery path. + + ## The Fix + + Use pandad's flash_panda() instead of bare panda.flash(). pandad's sequence is: + 1. Connect. If PandaProtocolMismatch, call HARDWARE.recover_internal_panda() (GPIO: assert + BOOT0 HIGH during reset, forcing the STM32 into hardware DFU mode) and retry. + 2. Call panda.flash(). If firmware still won't boot (still in bootstub after flash): + 3. For internal pandas (C3X, C4): call HARDWARE.recover_internal_panda() again, then + panda.recover() which flashes the bootstub via USB DFU, then reflashes the main app. + 4. Verify signature matches expected firmware. Raise if not. + + The GPIO path (step 3) is what bare panda.flash() is missing. By asserting BOOT0 HIGH at + the hardware level, it bypasses whatever was blocking the SPI writes and forces a clean + DFU-mode flash over USB. + + Source: selfdrive/pandad/pandad.py, flash_panda() lines 24-61 and main() lines 106-115, 147. 
+ """ + panda_serials = Panda.list() + if not panda_serials: + raise PandaError("No panda found") + + for _ in range(3): + try: + return flash_panda(panda_serials[0]) + except PandaProtocolMismatch as err: + # flash_panda() already called HARDWARE.recover_internal_panda() before re-raising; + # wait for the panda to come back up and retry + time.sleep(3) + panda_serials = Panda.list() + if not panda_serials: + raise PandaError("No panda found after protocol recovery") from err + except AssertionError as err: + raise PandaError("Panda firmware update failed") from err + + raise PandaError("Panda protocol mismatch persists after recovery") + @classmethod def _get_key_struct(cls, data, key_no): return data[key_no * cls.KEY_STRUCT_SIZE: (key_no + 1) * cls.KEY_STRUCT_SIZE] @@ -103,8 +183,7 @@ class TSKExtractor: except FileNotFoundError: pass - panda = Panda() - panda.flash() # no-op if firmware is already up to date; required because TSKM kills pandad before it can flash + panda = cls._connect_and_flash_panda() panda.set_safety_mode(CarParams.SafetyModel.elm327) uds_client = UdsClient(panda, cls.ADDR, cls.ADDR + 8, cls.BUS, timeout=0.1, response_pending_timeout=0.1)