This commit is contained in:
firestar5683
2026-06-20 13:08:23 -05:00
parent fbb6eb651c
commit 42a23508b7
5 changed files with 558 additions and 28 deletions
@@ -136,6 +136,17 @@
font-weight: var(--font-weight-bold);
}
/* Container for the "Interrupted update detected" notice: a left accent bar with padded content. */
.updateRecovery {
border-left: 3px solid var(--main-fg);
margin-top: var(--margin-base);
padding-left: var(--padding-base);
}
/* Paragraph inside the recovery notice: small text with tight vertical spacing. */
.updateRecovery p {
font-size: var(--font-size-sm);
margin: var(--margin-xs) 0 var(--margin-sm);
}
.updateFooter {
border-top: 1px solid var(--track-color);
margin-top: var(--margin-base);
@@ -16,6 +16,7 @@ const state = reactive({
branchesBusy: false,
switchBusy: false,
rollbackBusy: false,
recoveryBusy: false,
})
let initialized = false
@@ -128,6 +129,7 @@ function hasRecordedRollbackTarget() {
/**
 * Decide whether the primary update action should be displayed.
 *
 * A running action always shows it; a detected interrupted update hides it;
 * otherwise it is shown when a different branch is selected or when a
 * completed update check reported an available update.
 *
 * @returns {boolean} true when the primary update action should be visible.
 */
function shouldShowPrimaryUpdateAction() {
  if (state.status?.running) {
    return true
  }
  // While an interrupted update is pending, the recovery panel replaces the primary action.
  if (state.status?.interruptedUpdateRecovery?.detected) {
    return false
  }
  if (isSelectedBranchDifferent()) {
    return true
  }
  const hasChecked = Boolean(state.checkedForUpdates)
  return hasChecked && Boolean(state.status?.updateAvailable)
}
@@ -137,7 +139,10 @@ function isFactoryResetStatusActive() {
}
/**
 * Decide whether the status poller should keep running.
 *
 * Polling continues while an update action is running, while the device is in
 * the "rebooting" stage, while an interrupted update is detected (so the
 * recovery status stays current), or while a reconnect is pending.
 *
 * @returns {*} a truthy value when polling should continue. The last operand
 *   (`reconnectPending`) is returned as-is when every earlier check is false.
 */
function shouldContinuePolling() {
  // The stale pre-change return that preceded this expression was removed: it
  // made the interrupted-update condition unreachable.
  return !!state.status?.running
    || state.status?.stage === "rebooting"
    || !!state.status?.interruptedUpdateRecovery?.detected
    || reconnectPending
}
function shouldShowRebootNotice() {
@@ -372,7 +377,7 @@ async function setAutomaticUpdates(enabled) {
}
}
async function runFastUpdate() {
async function runFastUpdate(skipConfirmation = false) {
if (state.updateBusy) return
if (state.status?.running) {
showSnackbar("Fast update is already running.")
@@ -383,13 +388,15 @@ async function runFastUpdate() {
return
}
const confirmed = window.confirm(
"Fast update warning:\n\n" +
"- This update method skips backup creation.\n" +
"- Your device will reboot when the update is done.\n\n" +
"Continue with fast update?"
)
if (!confirmed) return
if (!skipConfirmation) {
const confirmed = window.confirm(
"Fast update warning:\n\n" +
"- This update method skips backup creation.\n" +
"- Your device will reboot when the update is done.\n\n" +
"Continue with fast update?"
)
if (!confirmed) return
}
state.updateBusy = true
try {
@@ -425,7 +432,7 @@ async function runFastUpdate() {
}
}
async function runBranchSwitch() {
async function runBranchSwitch(skipConfirmation = false) {
if (state.switchBusy) return
if (state.status?.running) {
showSnackbar("An update action is already running.")
@@ -444,13 +451,15 @@ async function runBranchSwitch() {
const currentBranch = String(state.status?.branch || "").trim()
const actionLabel = currentBranch && currentBranch === targetBranch ? "update" : "switch and update"
const confirmed = window.confirm(
`This will ${actionLabel} to the '${targetBranch}' branch.\n\n` +
"- This update method skips backup creation.\n" +
"- Your device will reboot when the update is done.\n\n" +
"Continue?"
)
if (!confirmed) return
if (!skipConfirmation) {
const confirmed = window.confirm(
`This will ${actionLabel} to the '${targetBranch}' branch.\n\n` +
"- This update method skips backup creation.\n" +
"- Your device will reboot when the update is done.\n\n" +
"Continue?"
)
if (!confirmed) return
}
state.switchBusy = true
try {
@@ -490,7 +499,7 @@ async function runBranchSwitch() {
}
}
async function runRollback() {
async function runRollback(skipConfirmation = false) {
if (state.rollbackBusy) return
if (state.status?.running) {
showSnackbar("An update action is already running.")
@@ -508,15 +517,17 @@ async function runRollback() {
return
}
const confirmed = window.confirm(
"Roll back to the previous installed version?\n\n" +
`Target: ${rollbackBranch || "Unknown"} @ ${shortHash(rollbackCommit)}\n\n` +
"- This restores the version this device was running before the last Galaxy update.\n" +
"- Automatic updates will be turned off.\n" +
"- Your device will reboot when the rollback is done.\n\n" +
"Continue?"
)
if (!confirmed) return
if (!skipConfirmation) {
const confirmed = window.confirm(
"Roll back to the previous installed version?\n\n" +
`Target: ${rollbackBranch || "Unknown"} @ ${shortHash(rollbackCommit)}\n\n` +
"- This restores the version this device was running before the last Galaxy update.\n" +
"- Automatic updates will be turned off.\n" +
"- Your device will reboot when the rollback is done.\n\n" +
"Continue?"
)
if (!confirmed) return
}
state.rollbackBusy = true
try {
@@ -554,6 +565,70 @@ async function runRollback() {
}
}
/**
 * Ask the backend to clear an abandoned update lock, then re-run the previous
 * update action without prompting a second time.
 *
 * Does nothing when a recovery is already in flight, when the device is
 * onroad, when the last known status says recovery is not possible, or when
 * the user declines the confirmation dialog.
 *
 * @returns {Promise<void>}
 */
async function retryInterruptedUpdate() {
  if (state.recoveryBusy) return

  if (state.status?.isOnroad) {
    showSnackbar("Actions are blocked while onroad.", "error")
    return
  }

  const recoveryInfo = state.status?.interruptedUpdateRecovery || {}
  if (!recoveryInfo.canRecover) {
    showSnackbar(recoveryInfo.reason || "This update cannot be recovered safely yet.", "error")
    return
  }

  const prompt = [
    "Retry the interrupted update safely?\n\n",
    "Galaxy will verify that the vehicle is parked and no update or Git process is active. ",
    "It will clear only the abandoned shallow-update lock, then retry the previous update action.",
  ].join("")
  if (!window.confirm(prompt)) return

  // Capture what was being attempted before the status object is replaced below.
  const modeToRetry = String(state.status?.lastMode || "").trim()
  const branchToRetry = String(state.status?.lastBranch || "").trim()

  state.recoveryBusy = true
  try {
    const response = await fetch("/api/update/recover", { method: "POST" })
    const payload = await readJsonPayload(response)
    const reportedRecovery = payload.interruptedUpdateRecovery

    if (!response.ok) {
      // Keep the server's latest recovery assessment even though the request failed.
      if (reportedRecovery) {
        state.status = { ...(state.status || {}), interruptedUpdateRecovery: reportedRecovery }
      }
      throw new Error(payload.error || response.statusText || "Failed to recover interrupted update")
    }

    const successMessage = payload.message || "Interrupted update recovered. Retrying now..."
    state.status = {
      ...(state.status || {}),
      running: false,
      stage: "idle",
      message: successMessage,
      lastError: "",
      interruptedUpdateRecovery: reportedRecovery || { detected: false, canRecover: false },
    }
    state.error = ""
    showSnackbar(successMessage)

    // Re-run the action that was interrupted; the user has already confirmed.
    const wasBranchSwitch = modeToRetry === "branch-switch" && branchToRetry
    if (wasBranchSwitch) {
      state.selectedBranch = branchToRetry
      state.hasManualBranchSelection = true
      await runBranchSwitch(true)
    } else if (modeToRetry === "rollback" && state.status?.rollbackAvailable) {
      await runRollback(true)
    } else {
      await runFastUpdate(true)
    }
  } catch (error) {
    showSnackbar(error?.message || "Failed to recover interrupted update", "error")
    await fetchStatus(false)
  } finally {
    state.recoveryBusy = false
  }
}
function initialize() {
if (initialized) return
initialized = true
@@ -696,6 +771,19 @@ export function UpdateManager() {
${() => !isFactoryResetStatusActive() && state.status?.lastError ? html`<p class="updateError"><strong>Last Error:</strong> ${state.status.lastError}</p>` : ""}
${() => state.error ? html`<p class="updateError"><strong>Error:</strong> ${state.error}</p>` : ""}
${() => !state.status?.running && state.status?.interruptedUpdateRecovery?.detected ? html`
<div class="updateRecovery">
<strong>Interrupted update detected</strong>
<p>${state.status.interruptedUpdateRecovery.reason || "Galaxy found a shallow-update lock left by an interrupted update."}</p>
<button
class="updateButton"
disabled="${() => !!state.status?.isOnroad || !!state.status?.running || state.recoveryBusy || !state.status?.interruptedUpdateRecovery?.canRecover || false}"
@click="${() => retryInterruptedUpdate()}">
${state.recoveryBusy ? "Checking..." : "Retry Update Safely"}
</button>
</div>
` : ""}
<div class="updateActions">
${() => !isSelectedBranchDifferent() ? html`
<button class="updateButton" @click="${() => fetchStatus(true)}">
@@ -715,7 +803,7 @@ export function UpdateManager() {
</button>
` : ""}
</div>
${() => !state.status?.running && !shouldShowPrimaryUpdateAction()
${() => !state.status?.running && !state.status?.interruptedUpdateRecovery?.detected && !shouldShowPrimaryUpdateAction()
? html`<p class="updateHint">Run <strong>Check for Updates</strong> first, or select a different branch in advanced options.</p>`
: ""}
${() => shouldShowRebootNotice()
@@ -0,0 +1,159 @@
import importlib.util
import os
from pathlib import Path
def _load_update_recovery_module():
  """Import ``update_recovery.py`` directly from its file path.

  The file is looked up one directory above the directory containing this
  test file and is loaded under the name ``update_recovery_under_test``.
  """
  source_file = Path(__file__).resolve().parents[1].joinpath("update_recovery.py")
  module_spec = importlib.util.spec_from_file_location("update_recovery_under_test", source_file)
  loaded_module = importlib.util.module_from_spec(module_spec)
  module_spec.loader.exec_module(loaded_module)
  return loaded_module
update_recovery = _load_update_recovery_module()
def _make_repo(tmp_path, *, lock_age=60.0, now=1_000.0):
  """Create a fake repository containing a shallow lock plus an empty proc root.

  The lock file's access and modification times are set to ``now - lock_age``.

  Returns:
    Tuple ``(repo_path, lock_path, proc_root)``.
  """
  repo_path = tmp_path / "openpilot"
  metadata_dir = repo_path / ".git"
  metadata_dir.mkdir(parents=True)

  lock_path = metadata_dir / update_recovery.SHALLOW_LOCK_NAME
  lock_path.write_text("", encoding="utf-8")
  lock_timestamp = now - lock_age
  os.utime(lock_path, (lock_timestamp, lock_timestamp))

  proc_root = tmp_path / "proc"
  proc_root.mkdir()
  return repo_path, lock_path, proc_root
def _inspect(repo_path, proc_root, *, now=1_000.0, is_onroad=False, update_running=False, updater_state="idle"):
  """Call ``inspect_interrupted_update`` with defaults describing a parked, idle device."""
  options = {
    "is_onroad": is_onroad,
    "update_running": update_running,
    "updater_state": updater_state,
    "now": now,
    "proc_root": proc_root,
  }
  return update_recovery.inspect_interrupted_update(repo_path, **options)
def test_missing_lock_does_not_offer_recovery(tmp_path):
  """A repository without a shallow lock is neither detected nor recoverable."""
  repo_path = tmp_path / "openpilot"
  repo_path.joinpath(".git").mkdir(parents=True)
  proc_root = tmp_path / "proc"
  proc_root.mkdir()

  result = _inspect(repo_path, proc_root)

  assert result["detected"] is False
  assert result["canRecover"] is False
def test_recent_lock_waits_before_recovery(tmp_path):
  """A lock only 10 seconds old is detected but must wait before recovery."""
  repo_path, _lock, proc_root = _make_repo(tmp_path, lock_age=10.0)

  result = _inspect(repo_path, proc_root)

  assert result["detected"] is True
  assert result["canRecover"] is False
  assert "Waiting" in result["reason"]
def test_recovery_is_blocked_onroad_or_while_updater_is_busy(tmp_path):
  """Onroad, a non-idle system updater, or a running Galaxy update each block recovery."""
  repo_path, _lock, proc_root = _make_repo(tmp_path)

  while_onroad = _inspect(repo_path, proc_root, is_onroad=True)
  while_updater_busy = _inspect(repo_path, proc_root, updater_state="downloading...")
  while_galaxy_busy = _inspect(repo_path, proc_root, update_running=True)

  # Each blocked state must explain itself with a distinct reason fragment.
  assert while_onroad["canRecover"] is False
  assert "parked" in while_onroad["reason"]

  assert while_updater_busy["canRecover"] is False
  assert "downloading" in while_updater_busy["reason"]

  assert while_galaxy_busy["canRecover"] is False
  assert "still running" in while_galaxy_busy["reason"]
def test_active_repo_git_process_blocks_recovery(tmp_path):
  """A git process whose cwd is the repository prevents recovery."""
  repo_path, _lock, proc_root = _make_repo(tmp_path)

  # Fake /proc/123 entry for `git fetch origin` running inside the repository.
  fake_process = proc_root / "123"
  fake_process.mkdir()
  fake_process.joinpath("comm").write_text("git\n", encoding="utf-8")
  fake_process.joinpath("cmdline").write_bytes(b"/usr/bin/git\0fetch\0origin\0")
  fake_process.joinpath("cwd").symlink_to(repo_path)

  result = _inspect(repo_path, proc_root)

  assert result["canRecover"] is False
  assert result["activeGitProcessCount"] == 1
  assert "still active" in result["reason"]
def test_non_repo_git_process_does_not_block_recovery(tmp_path):
  """A git process working in an unrelated directory does not block recovery."""
  repo_path, _lock, proc_root = _make_repo(tmp_path)
  unrelated_repo = tmp_path / "other"
  unrelated_repo.mkdir()

  # Fake /proc/123 entry for `git fetch origin` running somewhere else.
  fake_process = proc_root / "123"
  fake_process.mkdir()
  fake_process.joinpath("comm").write_text("git\n", encoding="utf-8")
  fake_process.joinpath("cmdline").write_bytes(b"/usr/bin/git\0fetch\0origin\0")
  fake_process.joinpath("cwd").symlink_to(unrelated_repo)

  result = _inspect(repo_path, proc_root)

  assert result["canRecover"] is True
def test_symlink_lock_is_never_removed(tmp_path):
  """A lock that is a symlink is refused, and the symlink target is left untouched."""
  repo_path = tmp_path / "openpilot"
  metadata_dir = repo_path / ".git"
  metadata_dir.mkdir(parents=True)

  symlink_target = tmp_path / "target"
  symlink_target.write_text("keep", encoding="utf-8")
  metadata_dir.joinpath(update_recovery.SHALLOW_LOCK_NAME).symlink_to(symlink_target)

  proc_root = tmp_path / "proc"
  proc_root.mkdir()

  outcome = update_recovery.recover_interrupted_update(
    repo_path,
    is_onroad=False,
    update_running=False,
    updater_state="idle",
    now=1_000.0,
    proc_root=proc_root,
  )
  recovered, result = outcome

  assert recovered is False
  assert result["canRecover"] is False
  assert symlink_target.read_text(encoding="utf-8") == "keep"
def test_safe_recovery_removes_only_stale_shallow_lock(tmp_path):
  """Recovery deletes the stale shallow lock and leaves other lock files alone."""
  repo_path, lock_path, proc_root = _make_repo(tmp_path)
  unrelated_lock = repo_path.joinpath(".git", "index.lock")
  unrelated_lock.write_text("keep", encoding="utf-8")

  outcome = update_recovery.recover_interrupted_update(
    repo_path,
    is_onroad=False,
    update_running=False,
    updater_state="idle",
    now=1_000.0,
    proc_root=proc_root,
  )
  recovered, result = outcome

  assert recovered is True
  assert result["detected"] is False
  assert not lock_path.exists()
  assert unrelated_lock.read_text(encoding="utf-8") == "keep"
def test_public_status_does_not_expose_lock_path_or_inode(tmp_path):
  """The public view of a recoverable status contains no underscore-prefixed keys."""
  repo_path, _lock, proc_root = _make_repo(tmp_path)

  internal_status = _inspect(repo_path, proc_root)
  exposed = update_recovery.public_recovery_status(internal_status)

  assert exposed["canRecover"] is True
  private_keys = [key for key in exposed if key.startswith("_")]
  assert not private_keys
+53
View File
@@ -82,6 +82,7 @@ from openpilot.starpilot.common.testing_grounds import (
from openpilot.starpilot.navigation.destination_store import normalize_destination_payload, update_recent_destinations
from openpilot.starpilot.system.the_galaxy.factory_reset import remove_path as _run_factory_reset_delete
from openpilot.starpilot.system.the_galaxy import utilities
from openpilot.starpilot.system.the_galaxy.update_recovery import inspect_interrupted_update, public_recovery_status, recover_interrupted_update
DISCORD_WEBHOOK_URL = os.getenv("DISCORD_WEBHOOK_URL")
@@ -1092,6 +1093,15 @@ def _get_fast_update_state():
with _fast_update_lock:
return dict(_fast_update_state)
def _get_interrupted_update_recovery(repo_path, state_data):
  """Return the public interrupted-update status for ``repo_path``.

  Reads the ``IsOnroad`` and ``UpdaterState`` params and the ``running`` flag
  of ``state_data``, then strips internal keys from the inspection result.
  """
  onroad = _safe_params_get_bool("IsOnroad")
  galaxy_update_running = bool(state_data.get("running"))
  system_updater_state = _safe_params_get("UpdaterState", encoding="utf-8", default="")

  inspection = inspect_interrupted_update(
    repo_path,
    is_onroad=onroad,
    update_running=galaxy_update_running,
    updater_state=system_updater_state,
  )
  return public_recovery_status(inspection)
def _set_fast_update_progress(step, label, step_percent=0.0, detail=""):
safe_step = max(1, min(_FAST_UPDATE_TOTAL_STEPS, int(step)))
safe_step_percent = float(max(0.0, min(100.0, step_percent)))
@@ -5317,15 +5327,58 @@ def setup(app):
@app.route("/api/update/fast/status", methods=["GET"])
def get_fast_update_status():
  """Return the fast-update state merged with git info and device flags as JSON."""
  state_data = _get_fast_update_state()
  repo_path = str(_get_openpilot_root())
  # Remote information is only collected when no update action is running.
  update_in_progress = state_data.get("running", False)
  git_data = _collect_fast_update_info(include_remote=not update_in_progress)

  payload = dict(state_data)
  payload.update(git_data)
  payload["isOnroad"] = _safe_params_get_bool("IsOnroad")
  payload["automaticUpdates"] = _safe_params_get_bool("AutomaticUpdates")
  payload["interruptedUpdateRecovery"] = _get_interrupted_update_recovery(repo_path, state_data)
  payload["warning"] = "Fast update skips backup creation and finalization safeguards."
  return jsonify(payload), 200
@app.route("/api/update/recover", methods=["POST"])
def recover_update():
  """Clear an abandoned update lock and reset the fast-update state to idle.

  Responds with HTTP 409 when the device is onroad, when an update action is
  marked as running, or when the recovery helper refuses to remove the lock.
  """
  if _safe_params_get_bool("IsOnroad"):
    return jsonify({"error": "Cannot recover an interrupted update while driving."}), 409

  repo_path = str(_get_openpilot_root())
  with _fast_update_lock:
    if _fast_update_state.get("running"):
      return jsonify({"error": "An update action is still in progress."}), 409

    # Onroad and running were checked above, so both are passed as False here.
    recovered, recovery_status = recover_interrupted_update(
      repo_path,
      is_onroad=False,
      update_running=False,
      updater_state=_safe_params_get("UpdaterState", encoding="utf-8", default=""),
    )
    if not recovered:
      failure_reason = recovery_status.get("reason") or "The interrupted update could not be recovered safely."
      failure_body = {
        "error": failure_reason,
        "interruptedUpdateRecovery": recovery_status,
      }
      return jsonify(failure_body), 409

    idle_state = {
      "running": False,
      "stage": "idle",
      "message": "Interrupted update recovered. Ready to retry.",
      "lastError": "",
      "finishedAt": time.time(),
      "progressStep": 0,
      "progressTotalSteps": _FAST_UPDATE_TOTAL_STEPS,
      "progressStepPercent": 0.0,
      "progressPercent": 0.0,
      "progressLabel": "Ready",
      "progressDetail": "Abandoned update lock cleared safely.",
    }
    _fast_update_state.update(idle_state)

  success_body = {
    "message": "Interrupted update recovered. Retrying now...",
    "interruptedUpdateRecovery": recovery_status,
  }
  return jsonify(success_body), 200
@app.route("/api/update/branches", methods=["GET"])
def get_update_branches():
state_data = _get_fast_update_state()
@@ -0,0 +1,219 @@
import os
import stat
from datetime import datetime
from pathlib import Path
SHALLOW_LOCK_NAME = "shallow.lock"
MIN_STALE_LOCK_AGE_SECONDS = 30.0
def _git_directory(repo_path):
repo = Path(repo_path).resolve()
dot_git = repo / ".git"
if dot_git.is_dir():
return dot_git.resolve()
try:
marker = dot_git.read_text(encoding="utf-8", errors="replace").strip()
except OSError:
return None
prefix = "gitdir:"
if not marker.lower().startswith(prefix):
return None
git_dir = Path(marker[len(prefix):].strip())
if not git_dir.is_absolute():
git_dir = dot_git.parent / git_dir
try:
return git_dir.resolve()
except OSError:
return None
def _path_is_within(path, root):
try:
return os.path.commonpath((str(Path(path).resolve()), str(Path(root).resolve()))) == str(Path(root).resolve())
except (OSError, ValueError):
return False
def _read_process_tokens(proc_dir):
try:
return [token.decode("utf-8", errors="replace") for token in (proc_dir / "cmdline").read_bytes().split(b"\0") if token]
except OSError:
return []
def _process_is_git(proc_dir, tokens):
try:
command_name = (proc_dir / "comm").read_text(encoding="utf-8", errors="replace").strip().lower()
except OSError:
command_name = ""
executable_name = Path(tokens[0]).name.lower() if tokens else ""
return command_name == "git" or command_name.startswith("git-") or executable_name == "git" or executable_name.startswith("git-")
def _process_uses_repo(proc_dir, tokens, repo_path, git_dir):
  """Return True when a process appears to operate on this repository.

  A process matches when its working directory (the ``proc_dir/cwd`` link), or
  a path given through ``-C <path>`` or ``--git-dir=<path>`` on its command
  line, lies within ``repo_path`` or ``git_dir``.
  """
  def _touches_repo(location):
    return _path_is_within(location, repo_path) or _path_is_within(location, git_dir)

  try:
    cwd = Path(os.readlink(proc_dir / "cwd"))
  except OSError:
    cwd = None

  if cwd is not None and _touches_repo(cwd):
    return True

  for position, token in enumerate(tokens):
    if token == "-C" and position + 1 < len(tokens):
      candidate = tokens[position + 1]
    elif token.startswith("--git-dir="):
      candidate = token.split("=", 1)[1]
    else:
      continue

    if not candidate:
      continue

    candidate_path = Path(candidate)
    # Relative arguments are taken relative to the process cwd when it is known.
    if cwd is not None and not candidate_path.is_absolute():
      candidate_path = cwd / candidate_path
    if _touches_repo(candidate_path):
      return True

  return False
def _active_repo_git_processes(repo_path, git_dir, proc_root=Path("/proc")):
  """Return the sorted PIDs of git processes that appear to use this repository.

  Only numerically named entries under ``proc_root`` are examined. An
  unreadable ``proc_root`` yields an empty list.
  """
  try:
    numeric_entries = [entry for entry in Path(proc_root).iterdir() if entry.name.isdigit()]
  except OSError:
    return []

  def _is_repo_git_process(entry):
    tokens = _read_process_tokens(entry)
    return _process_is_git(entry, tokens) and _process_uses_repo(entry, tokens, repo_path, git_dir)

  return sorted(int(entry.name) for entry in numeric_entries if _is_repo_git_process(entry))
def inspect_interrupted_update(repo_path, *, is_onroad, update_running, updater_state, now=None, proc_root=Path("/proc")):
  """Report whether a leftover shallow lock exists and whether it may be cleared.

  Args:
    repo_path: Path of the repository working tree.
    is_onroad: True when the device is onroad; recovery is then refused.
    update_running: True when an update action is marked as running.
    updater_state: System updater state string; anything other than "idle"
      (case-insensitive, surrounding whitespace ignored) blocks recovery.
    now: Optional timestamp used for the lock age; defaults to the current time.
    proc_root: Directory scanned for running processes.

  Returns:
    Dict with ``detected``, ``canRecover`` and ``reason``. Once the lock has
    been stat'ed it also carries ``ageSeconds`` and the internal keys
    ``_lockPath``, ``_lockDevice`` and ``_lockInode``; ``activeGitProcessCount``
    is added when active git processes block recovery.
  """
  repo = Path(repo_path).resolve()
  git_dir = _git_directory(repo)
  if git_dir is None:
    return {
      "detected": False,
      "canRecover": False,
      "reason": "Git repository metadata could not be located.",
    }

  lock_path = git_dir / SHALLOW_LOCK_NAME
  try:
    # lstat so a symlinked lock is seen as a symlink, not as its target.
    lock_stat = lock_path.lstat()
  except FileNotFoundError:
    return {
      "detected": False,
      "canRecover": False,
      "reason": "No interrupted update lock detected.",
    }
  except OSError as exception:
    # str(exception) embeds the lock's filesystem path, which the public status
    # deliberately omits; report only the OS error description.
    error_detail = exception.strerror or type(exception).__name__
    return {
      "detected": True,
      "canRecover": False,
      "reason": f"Unable to inspect the interrupted update lock: {error_detail}",
    }

  current_time = datetime.now().timestamp() if now is None else float(now)
  status = {
    "detected": True,
    "canRecover": False,
    "reason": "",
    "ageSeconds": max(0, int(current_time - lock_stat.st_mtime)),
    # Underscore-prefixed keys are internal; they let the caller re-verify the
    # same file before deleting it.
    "_lockPath": lock_path,
    "_lockDevice": lock_stat.st_dev,
    "_lockInode": lock_stat.st_ino,
  }

  if not stat.S_ISREG(lock_stat.st_mode):
    status["reason"] = "The update lock is not a regular file and cannot be recovered automatically."
    return status

  if is_onroad:
    status["reason"] = "Interrupted updates can only be recovered while parked."
    return status

  if update_running:
    status["reason"] = "Galaxy reports that an update action is still running."
    return status

  normalized_updater_state = str(updater_state or "").strip().lower()
  if normalized_updater_state != "idle":
    label = normalized_updater_state or "unknown"
    status["reason"] = f"The system updater is currently {label}."
    return status

  active_pids = _active_repo_git_processes(repo, git_dir, proc_root=proc_root)
  if active_pids:
    status["reason"] = "A Git update process is still active for this repository."
    status["activeGitProcessCount"] = len(active_pids)
    return status

  # A very recent lock is not yet treated as abandoned.
  if status["ageSeconds"] < MIN_STALE_LOCK_AGE_SECONDS:
    remaining = max(1, int(MIN_STALE_LOCK_AGE_SECONDS - status["ageSeconds"]))
    status["reason"] = f"Waiting {remaining} more seconds before this lock can be treated as abandoned."
    return status

  status["canRecover"] = True
  status["reason"] = "An abandoned update lock can be recovered safely."
  return status
def public_recovery_status(status):
  """Return a copy of ``status`` without the keys that start with an underscore."""
  exposed = {}
  for key, value in status.items():
    if key.startswith("_"):
      continue
    exposed[key] = value
  return exposed
def recover_interrupted_update(repo_path, *, is_onroad, update_running, updater_state, now=None, proc_root=Path("/proc")):
  """Remove an abandoned shallow lock when inspection says it is safe to do so.

  Arguments are passed unchanged to ``inspect_interrupted_update``.

  Returns:
    Tuple ``(recovered, public_status)``. ``recovered`` is True when the lock
    was removed or had already disappeared. ``public_status`` never contains
    underscore-prefixed keys.
  """
  status = inspect_interrupted_update(
    repo_path,
    is_onroad=is_onroad,
    update_running=update_running,
    updater_state=updater_state,
    now=now,
    proc_root=proc_root,
  )
  if not status.get("canRecover"):
    return False, public_recovery_status(status)

  lock_path = status["_lockPath"]
  try:
    current_stat = lock_path.lstat()
  except FileNotFoundError:
    return True, {
      "detected": False,
      "canRecover": False,
      "reason": "The interrupted update lock was already cleared.",
    }
  except OSError as exception:
    # str(exception) embeds the lock's filesystem path, which the public status
    # deliberately omits; report only the OS error description.
    error_detail = exception.strerror or type(exception).__name__
    status["canRecover"] = False
    status["reason"] = f"Unable to recheck the interrupted update lock: {error_detail}"
    return False, public_recovery_status(status)

  # Only delete the exact file that was inspected: still a regular file with the
  # same device and inode numbers.
  lock_unchanged = (
    stat.S_ISREG(current_stat.st_mode)
    and current_stat.st_dev == status["_lockDevice"]
    and current_stat.st_ino == status["_lockInode"]
  )
  if not lock_unchanged:
    status["canRecover"] = False
    status["reason"] = "The update lock changed while it was being checked. Retry after current update activity stops."
    return False, public_recovery_status(status)

  try:
    lock_path.unlink()
  except OSError as exception:
    error_detail = exception.strerror or type(exception).__name__
    status["canRecover"] = False
    status["reason"] = f"Unable to clear the interrupted update lock: {error_detail}"
    return False, public_recovery_status(status)

  return True, {
    "detected": False,
    "canRecover": False,
    "reason": "Interrupted update lock cleared.",
  }