diff --git a/starpilot/system/the_galaxy/assets/components/tools/update_manager.css b/starpilot/system/the_galaxy/assets/components/tools/update_manager.css index a71ce59a4..b504b5927 100644 --- a/starpilot/system/the_galaxy/assets/components/tools/update_manager.css +++ b/starpilot/system/the_galaxy/assets/components/tools/update_manager.css @@ -136,6 +136,17 @@ font-weight: var(--font-weight-bold); } +.updateRecovery { + border-left: 3px solid var(--main-fg); + margin-top: var(--margin-base); + padding-left: var(--padding-base); +} + +.updateRecovery p { + font-size: var(--font-size-sm); + margin: var(--margin-xs) 0 var(--margin-sm); +} + .updateFooter { border-top: 1px solid var(--track-color); margin-top: var(--margin-base); diff --git a/starpilot/system/the_galaxy/assets/components/tools/update_manager.js b/starpilot/system/the_galaxy/assets/components/tools/update_manager.js index a212bf9e6..c0b3ca19d 100644 --- a/starpilot/system/the_galaxy/assets/components/tools/update_manager.js +++ b/starpilot/system/the_galaxy/assets/components/tools/update_manager.js @@ -16,6 +16,7 @@ const state = reactive({ branchesBusy: false, switchBusy: false, rollbackBusy: false, + recoveryBusy: false, }) let initialized = false @@ -128,6 +129,7 @@ function hasRecordedRollbackTarget() { function shouldShowPrimaryUpdateAction() { if (state.status?.running) return true + if (state.status?.interruptedUpdateRecovery?.detected) return false if (isSelectedBranchDifferent()) return true return !!state.checkedForUpdates && !!state.status?.updateAvailable } @@ -137,7 +139,10 @@ function isFactoryResetStatusActive() { } function shouldContinuePolling() { - return !!state.status?.running || state.status?.stage === "rebooting" || reconnectPending + return !!state.status?.running + || state.status?.stage === "rebooting" + || !!state.status?.interruptedUpdateRecovery?.detected + || reconnectPending } function shouldShowRebootNotice() { @@ -372,7 +377,7 @@ async function setAutomaticUpdates(enabled) { } } -async function runFastUpdate() { +async function runFastUpdate(skipConfirmation = false) { if (state.updateBusy) return if (state.status?.running) { showSnackbar("Fast update is already running.") @@ -383,13 +388,15 @@ async function runFastUpdate() { return } - const confirmed = window.confirm( - "Fast update warning:\n\n" + - "- This update method skips backup creation.\n" + - "- Your device will reboot when the update is done.\n\n" + - "Continue with fast update?" - ) - if (!confirmed) return + if (!skipConfirmation) { + const confirmed = window.confirm( + "Fast update warning:\n\n" + + "- This update method skips backup creation.\n" + + "- Your device will reboot when the update is done.\n\n" + + "Continue with fast update?" + ) + if (!confirmed) return + } state.updateBusy = true try { @@ -425,7 +432,7 @@ async function runFastUpdate() { } } -async function runBranchSwitch() { +async function runBranchSwitch(skipConfirmation = false) { if (state.switchBusy) return if (state.status?.running) { showSnackbar("An update action is already running.") @@ -444,13 +451,15 @@ async function runBranchSwitch() { const currentBranch = String(state.status?.branch || "").trim() const actionLabel = currentBranch && currentBranch === targetBranch ? "update" : "switch and update" - const confirmed = window.confirm( - `This will ${actionLabel} to the '${targetBranch}' branch.\n\n` + - "- This update method skips backup creation.\n" + - "- Your device will reboot when the update is done.\n\n" + - "Continue?" - ) - if (!confirmed) return + if (!skipConfirmation) { + const confirmed = window.confirm( + `This will ${actionLabel} to the '${targetBranch}' branch.\n\n` + + "- This update method skips backup creation.\n" + + "- Your device will reboot when the update is done.\n\n" + + "Continue?" + ) + if (!confirmed) return + } state.switchBusy = true try { @@ -490,7 +499,7 @@ async function runBranchSwitch() { } } -async function runRollback() { +async function runRollback(skipConfirmation = false) { if (state.rollbackBusy) return if (state.status?.running) { showSnackbar("An update action is already running.") @@ -508,15 +517,17 @@ async function runRollback() { return } - const confirmed = window.confirm( - "Roll back to the previous installed version?\n\n" + - `Target: ${rollbackBranch || "Unknown"} @ ${shortHash(rollbackCommit)}\n\n` + - "- This restores the version this device was running before the last Galaxy update.\n" + - "- Automatic updates will be turned off.\n" + - "- Your device will reboot when the rollback is done.\n\n" + - "Continue?" - ) - if (!confirmed) return + if (!skipConfirmation) { + const confirmed = window.confirm( + "Roll back to the previous installed version?\n\n" + + `Target: ${rollbackBranch || "Unknown"} @ ${shortHash(rollbackCommit)}\n\n` + + "- This restores the version this device was running before the last Galaxy update.\n" + + "- Automatic updates will be turned off.\n" + + "- Your device will reboot when the rollback is done.\n\n" + + "Continue?" + ) + if (!confirmed) return + } state.rollbackBusy = true try { @@ -554,6 +565,70 @@ async function runRollback() { } } +async function retryInterruptedUpdate() { + if (state.recoveryBusy) return + if (state.status?.isOnroad) { + showSnackbar("Actions are blocked while onroad.", "error") + return + } + + const recovery = state.status?.interruptedUpdateRecovery || {} + if (!recovery.canRecover) { + showSnackbar(recovery.reason || "This update cannot be recovered safely yet.", "error") + return + } + + const confirmed = window.confirm( + "Retry the interrupted update safely?\n\n" + + "Galaxy will verify that the vehicle is parked and no update or Git process is active. " + + "It will clear only the abandoned shallow-update lock, then retry the previous update action." + ) + if (!confirmed) return + + const previousMode = String(state.status?.lastMode || "").trim() + const previousBranch = String(state.status?.lastBranch || "").trim() + state.recoveryBusy = true + try { + const response = await fetch("/api/update/recover", { method: "POST" }) + const payload = await readJsonPayload(response) + if (!response.ok) { + if (payload.interruptedUpdateRecovery) { + state.status = { + ...(state.status || {}), + interruptedUpdateRecovery: payload.interruptedUpdateRecovery, + } + } + throw new Error(payload.error || response.statusText || "Failed to recover interrupted update") + } + + state.status = { + ...(state.status || {}), + running: false, + stage: "idle", + message: payload.message || "Interrupted update recovered. Retrying now...", + lastError: "", + interruptedUpdateRecovery: payload.interruptedUpdateRecovery || { detected: false, canRecover: false }, + } + state.error = "" + showSnackbar(payload.message || "Interrupted update recovered. Retrying now...") + + if (previousMode === "branch-switch" && previousBranch) { + state.selectedBranch = previousBranch + state.hasManualBranchSelection = true + await runBranchSwitch(true) + } else if (previousMode === "rollback" && state.status?.rollbackAvailable) { + await runRollback(true) + } else { + await runFastUpdate(true) + } + } catch (error) { + showSnackbar(error?.message || "Failed to recover interrupted update", "error") + await fetchStatus(false) + } finally { + state.recoveryBusy = false + } +} + function initialize() { if (initialized) return initialized = true @@ -696,6 +771,19 @@ export function UpdateManager() { ${() => !isFactoryResetStatusActive() && state.status?.lastError ? html`
Last Error: ${state.status.lastError}
` : ""} ${() => state.error ? html`Error: ${state.error}
` : ""} + ${() => !state.status?.running && state.status?.interruptedUpdateRecovery?.detected ? html` +${state.status.interruptedUpdateRecovery.reason || "Galaxy found a shallow-update lock left by an interrupted update."}
+ +Run Check for Updates first, or select a different branch in advanced options.
` : ""} ${() => shouldShowRebootNotice() diff --git a/starpilot/system/the_galaxy/tests/test_update_recovery.py b/starpilot/system/the_galaxy/tests/test_update_recovery.py new file mode 100644 index 000000000..18f3e1d4f --- /dev/null +++ b/starpilot/system/the_galaxy/tests/test_update_recovery.py @@ -0,0 +1,159 @@ +import importlib.util +import os + +from pathlib import Path + + +def _load_update_recovery_module(): + module_path = Path(__file__).resolve().parents[1] / "update_recovery.py" + spec = importlib.util.spec_from_file_location("update_recovery_under_test", module_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +update_recovery = _load_update_recovery_module() + + +def _make_repo(tmp_path, *, lock_age=60.0, now=1_000.0): + repo_path = tmp_path / "openpilot" + git_dir = repo_path / ".git" + git_dir.mkdir(parents=True) + lock_path = git_dir / update_recovery.SHALLOW_LOCK_NAME + lock_path.write_text("", encoding="utf-8") + os.utime(lock_path, (now - lock_age, now - lock_age)) + proc_root = tmp_path / "proc" + proc_root.mkdir() + return repo_path, lock_path, proc_root + + +def _inspect(repo_path, proc_root, *, now=1_000.0, is_onroad=False, update_running=False, updater_state="idle"): + return update_recovery.inspect_interrupted_update( + repo_path, + is_onroad=is_onroad, + update_running=update_running, + updater_state=updater_state, + now=now, + proc_root=proc_root, + ) + + +def test_missing_lock_does_not_offer_recovery(tmp_path): + repo_path = tmp_path / "openpilot" + (repo_path / ".git").mkdir(parents=True) + proc_root = tmp_path / "proc" + proc_root.mkdir() + + status = _inspect(repo_path, proc_root) + + assert status["detected"] is False + assert status["canRecover"] is False + + +def test_recent_lock_waits_before_recovery(tmp_path): + repo_path, _, proc_root = _make_repo(tmp_path, lock_age=10.0) + + status = _inspect(repo_path, proc_root) + + assert status["detected"] is True + assert status["canRecover"] is False + assert "Waiting" in status["reason"] + + +def test_recovery_is_blocked_onroad_or_while_updater_is_busy(tmp_path): + repo_path, _, proc_root = _make_repo(tmp_path) + + onroad_status = _inspect(repo_path, proc_root, is_onroad=True) + updater_status = _inspect(repo_path, proc_root, updater_state="downloading...") + galaxy_status = _inspect(repo_path, proc_root, update_running=True) + + assert onroad_status["canRecover"] is False + assert "parked" in onroad_status["reason"] + assert updater_status["canRecover"] is False + assert "downloading" in updater_status["reason"] + assert galaxy_status["canRecover"] is False + assert "still running" in galaxy_status["reason"] + + +def test_active_repo_git_process_blocks_recovery(tmp_path): + repo_path, _, proc_root = _make_repo(tmp_path) + process_dir = proc_root / "123" + process_dir.mkdir() + (process_dir / "comm").write_text("git\n", encoding="utf-8") + (process_dir / "cmdline").write_bytes(b"/usr/bin/git\0fetch\0origin\0") + (process_dir / "cwd").symlink_to(repo_path) + + status = _inspect(repo_path, proc_root) + + assert status["canRecover"] is False + assert status["activeGitProcessCount"] == 1 + assert "still active" in status["reason"] + + +def test_non_repo_git_process_does_not_block_recovery(tmp_path): + repo_path, _, proc_root = _make_repo(tmp_path) + other_repo = tmp_path / "other" + other_repo.mkdir() + process_dir = proc_root / "123" + process_dir.mkdir() + (process_dir / "comm").write_text("git\n", encoding="utf-8") + (process_dir / "cmdline").write_bytes(b"/usr/bin/git\0fetch\0origin\0") + (process_dir / "cwd").symlink_to(other_repo) + + status = _inspect(repo_path, proc_root) + + assert status["canRecover"] is True + + +def test_symlink_lock_is_never_removed(tmp_path): + repo_path = tmp_path / "openpilot" + git_dir = repo_path / ".git" + git_dir.mkdir(parents=True) + target_path = tmp_path / "target" + target_path.write_text("keep", encoding="utf-8") + (git_dir / update_recovery.SHALLOW_LOCK_NAME).symlink_to(target_path) + proc_root = tmp_path / "proc" + proc_root.mkdir() + + recovered, status = update_recovery.recover_interrupted_update( + repo_path, + is_onroad=False, + update_running=False, + updater_state="idle", + now=1_000.0, + proc_root=proc_root, + ) + + assert recovered is False + assert status["canRecover"] is False + assert target_path.read_text(encoding="utf-8") == "keep" + + +def test_safe_recovery_removes_only_stale_shallow_lock(tmp_path): + repo_path, lock_path, proc_root = _make_repo(tmp_path) + keep_path = repo_path / ".git" / "index.lock" + keep_path.write_text("keep", encoding="utf-8") + + recovered, status = update_recovery.recover_interrupted_update( + repo_path, + is_onroad=False, + update_running=False, + updater_state="idle", + now=1_000.0, + proc_root=proc_root, + ) + + assert recovered is True + assert status["detected"] is False + assert not lock_path.exists() + assert keep_path.read_text(encoding="utf-8") == "keep" + + +def test_public_status_does_not_expose_lock_path_or_inode(tmp_path): + repo_path, _, proc_root = _make_repo(tmp_path) + + status = _inspect(repo_path, proc_root) + public_status = update_recovery.public_recovery_status(status) + + assert public_status["canRecover"] is True + assert all(not key.startswith("_") for key in public_status) diff --git a/starpilot/system/the_galaxy/the_galaxy.py b/starpilot/system/the_galaxy/the_galaxy.py index 14d557846..204c64c12 100644 --- a/starpilot/system/the_galaxy/the_galaxy.py +++ b/starpilot/system/the_galaxy/the_galaxy.py @@ -82,6 +82,7 @@ from openpilot.starpilot.common.testing_grounds import ( from openpilot.starpilot.navigation.destination_store import normalize_destination_payload, update_recent_destinations from openpilot.starpilot.system.the_galaxy.factory_reset import remove_path as _run_factory_reset_delete from openpilot.starpilot.system.the_galaxy import utilities +from openpilot.starpilot.system.the_galaxy.update_recovery import inspect_interrupted_update, public_recovery_status, recover_interrupted_update DISCORD_WEBHOOK_URL = os.getenv("DISCORD_WEBHOOK_URL") @@ -1092,6 +1093,15 @@ def _get_fast_update_state(): with _fast_update_lock: return dict(_fast_update_state) +def _get_interrupted_update_recovery(repo_path, state_data): + recovery_status = inspect_interrupted_update( + repo_path, + is_onroad=_safe_params_get_bool("IsOnroad"), + update_running=bool(state_data.get("running")), + updater_state=_safe_params_get("UpdaterState", encoding="utf-8", default=""), + ) + return public_recovery_status(recovery_status) + def _set_fast_update_progress(step, label, step_percent=0.0, detail=""): safe_step = max(1, min(_FAST_UPDATE_TOTAL_STEPS, int(step))) safe_step_percent = float(max(0.0, min(100.0, step_percent))) @@ -5317,15 +5327,58 @@ def setup(app): @app.route("/api/update/fast/status", methods=["GET"]) def get_fast_update_status(): state_data = _get_fast_update_state() + repo_path = str(_get_openpilot_root()) git_data = _collect_fast_update_info(include_remote=not state_data.get("running", False)) return jsonify({ **state_data, **git_data, "isOnroad": _safe_params_get_bool("IsOnroad"), "automaticUpdates": _safe_params_get_bool("AutomaticUpdates"), + "interruptedUpdateRecovery": _get_interrupted_update_recovery(repo_path, state_data), "warning": "Fast update skips backup creation and finalization safeguards.", }), 200 + @app.route("/api/update/recover", methods=["POST"]) + def recover_update(): + if _safe_params_get_bool("IsOnroad"): + return jsonify({"error": "Cannot recover an interrupted update while driving."}), 409 + + repo_path = str(_get_openpilot_root()) + with _fast_update_lock: + if _fast_update_state.get("running"): + return jsonify({"error": "An update action is still in progress."}), 409 + + recovered, recovery_status = recover_interrupted_update( + repo_path, + is_onroad=False, + update_running=False, + updater_state=_safe_params_get("UpdaterState", encoding="utf-8", default=""), + ) + if not recovered: + return jsonify({ + "error": recovery_status.get("reason") or "The interrupted update could not be recovered safely.", + "interruptedUpdateRecovery": recovery_status, + }), 409 + + _fast_update_state.update({ + "running": False, + "stage": "idle", + "message": "Interrupted update recovered. Ready to retry.", + "lastError": "", + "finishedAt": time.time(), + "progressStep": 0, + "progressTotalSteps": _FAST_UPDATE_TOTAL_STEPS, + "progressStepPercent": 0.0, + "progressPercent": 0.0, + "progressLabel": "Ready", + "progressDetail": "Abandoned update lock cleared safely.", + }) + + return jsonify({ + "message": "Interrupted update recovered. Retrying now...", + "interruptedUpdateRecovery": recovery_status, + }), 200 + @app.route("/api/update/branches", methods=["GET"]) def get_update_branches(): state_data = _get_fast_update_state() diff --git a/starpilot/system/the_galaxy/update_recovery.py b/starpilot/system/the_galaxy/update_recovery.py new file mode 100644 index 000000000..6798fe072 --- /dev/null +++ b/starpilot/system/the_galaxy/update_recovery.py @@ -0,0 +1,219 @@ +import os +import stat + +from datetime import datetime +from pathlib import Path + + +SHALLOW_LOCK_NAME = "shallow.lock" +MIN_STALE_LOCK_AGE_SECONDS = 30.0 + + +def _git_directory(repo_path): + repo = Path(repo_path).resolve() + dot_git = repo / ".git" + + if dot_git.is_dir(): + return dot_git.resolve() + + try: + marker = dot_git.read_text(encoding="utf-8", errors="replace").strip() + except OSError: + return None + + prefix = "gitdir:" + if not marker.lower().startswith(prefix): + return None + + git_dir = Path(marker[len(prefix):].strip()) + if not git_dir.is_absolute(): + git_dir = dot_git.parent / git_dir + try: + return git_dir.resolve() + except OSError: + return None + + +def _path_is_within(path, root): + try: + return os.path.commonpath((str(Path(path).resolve()), str(Path(root).resolve()))) == str(Path(root).resolve()) + except (OSError, ValueError): + return False + + +def _read_process_tokens(proc_dir): + try: + return [token.decode("utf-8", errors="replace") for token in (proc_dir / "cmdline").read_bytes().split(b"\0") if token] + except OSError: + return [] + + +def _process_is_git(proc_dir, tokens): + try: + command_name = (proc_dir / "comm").read_text(encoding="utf-8", errors="replace").strip().lower() + except OSError: + command_name = "" + + executable_name = Path(tokens[0]).name.lower() if tokens else "" + return command_name == "git" or command_name.startswith("git-") or executable_name == "git" or executable_name.startswith("git-") + + +def _process_uses_repo(proc_dir, tokens, repo_path, git_dir): + try: + cwd = Path(os.readlink(proc_dir / "cwd")) + except OSError: + cwd = None + + if cwd is not None and (_path_is_within(cwd, repo_path) or _path_is_within(cwd, git_dir)): + return True + + for index, token in enumerate(tokens): + candidate = "" + if token == "-C" and index + 1 < len(tokens): + candidate = tokens[index + 1] + elif token.startswith("--git-dir="): + candidate = token.split("=", 1)[1] + if not candidate: + continue + + candidate_path = Path(candidate) + if not candidate_path.is_absolute() and cwd is not None: + candidate_path = cwd / candidate_path + if _path_is_within(candidate_path, repo_path) or _path_is_within(candidate_path, git_dir): + return True + + return False + + +def _active_repo_git_processes(repo_path, git_dir, proc_root=Path("/proc")): + active_pids = [] + try: + process_dirs = [entry for entry in Path(proc_root).iterdir() if entry.name.isdigit()] + except OSError: + return active_pids + + for proc_dir in process_dirs: + tokens = _read_process_tokens(proc_dir) + if not _process_is_git(proc_dir, tokens): + continue + if _process_uses_repo(proc_dir, tokens, repo_path, git_dir): + active_pids.append(int(proc_dir.name)) + + return sorted(active_pids) + + +def inspect_interrupted_update(repo_path, *, is_onroad, update_running, updater_state, now=None, proc_root=Path("/proc")): + repo = Path(repo_path).resolve() + git_dir = _git_directory(repo) + if git_dir is None: + return { + "detected": False, + "canRecover": False, + "reason": "Git repository metadata could not be located.", + } + + lock_path = git_dir / SHALLOW_LOCK_NAME + try: + lock_stat = lock_path.lstat() + except FileNotFoundError: + return { + "detected": False, + "canRecover": False, + "reason": "No interrupted update lock detected.", + } + except OSError as exception: + return { + "detected": True, + "canRecover": False, + "reason": f"Unable to inspect the interrupted update lock: {exception}", + } + + status = { + "detected": True, + "canRecover": False, + "reason": "", + "ageSeconds": max(0, int((datetime.now().timestamp() if now is None else float(now)) - lock_stat.st_mtime)), + "_lockPath": lock_path, + "_lockDevice": lock_stat.st_dev, + "_lockInode": lock_stat.st_ino, + } + + if not stat.S_ISREG(lock_stat.st_mode): + status["reason"] = "The update lock is not a regular file and cannot be recovered automatically." + return status + if is_onroad: + status["reason"] = "Interrupted updates can only be recovered while parked." + return status + if update_running: + status["reason"] = "Galaxy reports that an update action is still running." + return status + + normalized_updater_state = str(updater_state or "").strip().lower() + if normalized_updater_state != "idle": + label = normalized_updater_state or "unknown" + status["reason"] = f"The system updater is currently {label}." + return status + + active_pids = _active_repo_git_processes(repo, git_dir, proc_root=proc_root) + if active_pids: + status["reason"] = "A Git update process is still active for this repository." + status["activeGitProcessCount"] = len(active_pids) + return status + + if status["ageSeconds"] < MIN_STALE_LOCK_AGE_SECONDS: + remaining = max(1, int(MIN_STALE_LOCK_AGE_SECONDS - status["ageSeconds"])) + status["reason"] = f"Waiting {remaining} more seconds before this lock can be treated as abandoned." + return status + + status["canRecover"] = True + status["reason"] = "An abandoned update lock can be recovered safely." + return status + + +def public_recovery_status(status): + return {key: value for key, value in status.items() if not key.startswith("_")} + + +def recover_interrupted_update(repo_path, *, is_onroad, update_running, updater_state, now=None, proc_root=Path("/proc")): + status = inspect_interrupted_update( + repo_path, + is_onroad=is_onroad, + update_running=update_running, + updater_state=updater_state, + now=now, + proc_root=proc_root, + ) + if not status.get("canRecover"): + return False, public_recovery_status(status) + + lock_path = status["_lockPath"] + try: + current_stat = lock_path.lstat() + except FileNotFoundError: + return True, { + "detected": False, + "canRecover": False, + "reason": "The interrupted update lock was already cleared.", + } + except OSError as exception: + status["canRecover"] = False + status["reason"] = f"Unable to recheck the interrupted update lock: {exception}" + return False, public_recovery_status(status) + + if not stat.S_ISREG(current_stat.st_mode) or current_stat.st_dev != status["_lockDevice"] or current_stat.st_ino != status["_lockInode"]: + status["canRecover"] = False + status["reason"] = "The update lock changed while it was being checked. Retry after current update activity stops." + return False, public_recovery_status(status) + + try: + lock_path.unlink() + except OSError as exception: + status["canRecover"] = False + status["reason"] = f"Unable to clear the interrupted update lock: {exception}" + return False, public_recovery_status(status) + + return True, { + "detected": False, + "canRecover": False, + "reason": "Interrupted update lock cleared.", + }