165 lines
4.1 KiB
Python
165 lines
4.1 KiB
Python
#!/usr/bin/env python3
|
|
import json
|
|
import os
|
|
import signal
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
|
|
from collections import defaultdict, deque
|
|
from pathlib import Path
|
|
|
|
from openpilot.common.basedir import BASEDIR
|
|
from openpilot.common.swaglog import cloudlog
|
|
|
|
# Directory used as the working directory of the mapd child process.
MAPD_DIR = Path(BASEDIR) / "frogpilot/navigation"
# Executable launched by run_mapd_once().
MAPD_BIN = MAPD_DIR / "mapd"
# Only files located under this directory may be quarantined.
OFFLINE_ROOT = Path("/data/media/0/osm/offline")
# Seconds main() sleeps before restarting mapd after run_mapd_once() returns 1
# (a tile was quarantined).
RESTART_DELAY_S = 0.25
# Seconds main() sleeps before restarting mapd after run_mapd_once() returns 2
# (repeated read failures, but nothing could be quarantined).
MISSING_TILE_BACKOFF_S = 30.0
# Default sliding-window length, in seconds, for CorruptTileMonitor.
FAILURE_WINDOW_S = 3.0
# Default number of failures inside the window that flags a tile.
FAILURE_THRESHOLD = 3
|
|
|
|
|
|
def extract_bounds_filename(line: str) -> str | None:
|
|
try:
|
|
payload = json.loads(line)
|
|
except json.JSONDecodeError:
|
|
return None
|
|
|
|
if payload.get("msg") != "Loading bounds file":
|
|
return None
|
|
|
|
filename = payload.get("filename")
|
|
return filename if isinstance(filename, str) else None
|
|
|
|
|
|
def is_offline_read_error(line: str) -> bool:
    """Tell whether a mapd log line reports an offline-data unmarshal failure.

    Args:
        line: One raw line of mapd output. It is expected, but not required,
            to be a JSON object.

    Returns:
        ``True`` only when ``line`` parses to a JSON object whose ``msg``
        equals ``"could not unmarshal offline data"``; ``False`` otherwise,
        including for non-JSON text and for JSON values that are not objects.
    """
    try:
        payload = json.loads(line)
    except json.JSONDecodeError:
        return False

    # A JSON scalar, list or ``null`` has no ``.get``; treat it as "no match"
    # instead of letting AttributeError escape.
    if not isinstance(payload, dict):
        return False

    return payload.get("msg") == "could not unmarshal offline data"
|
|
|
|
|
|
class CorruptTileMonitor:
    """Count offline read errors per bounds file inside a sliding time window.

    Lines are fed in one at a time through ``observe``. A "Loading bounds
    file" line sets the file that subsequent read errors are attributed to.
    Once ``threshold`` errors for that file fall within ``window_s`` seconds,
    ``observe`` returns the file name.
    """

    def __init__(self, threshold: int = FAILURE_THRESHOLD, window_s: float = FAILURE_WINDOW_S):
        """Store the trigger parameters and start with no tracked file."""
        self.threshold = threshold
        self.window_s = window_s
        # File named by the most recent "Loading bounds file" line, if any.
        self.current_filename: str | None = None
        # Per-file timestamps of read errors, oldest on the left.
        self.failures: dict[str, deque[float]] = defaultdict(deque)

    def observe(self, line: str, now: float | None = None) -> str | None:
        """Process one output line and report a file that crossed the threshold.

        Args:
            line: One raw line of mapd output.
            now: Timestamp to record for a read error; ``time.monotonic()``
                is used when omitted.

        Returns:
            The currently tracked file name when this line is a read error
            and the file has at least ``threshold`` errors in the window;
            otherwise ``None``.
        """
        loaded = extract_bounds_filename(line)
        if loaded is not None:
            # A new bounds file is being loaded; later errors belong to it.
            self.current_filename = loaded
            return None

        if not is_offline_read_error(line):
            return None
        active = self.current_filename
        if active is None:
            # Error seen before any bounds file was announced: nothing to blame.
            return None

        stamp = now if now is not None else time.monotonic()
        recent = self.failures[active]
        recent.append(stamp)

        # Discard entries that are older than the window.
        oldest_allowed = stamp - self.window_s
        while recent and recent[0] < oldest_allowed:
            recent.popleft()

        return active if len(recent) >= self.threshold else None
|
|
|
|
|
|
def quarantine_offline_tile(filename: str) -> Path | None:
    """Rename a suspect offline tile so it is no longer found under its name.

    The file is renamed in place to ``<name>.corrupt.<unix-seconds>``.

    Args:
        filename: Path of the tile as reported in mapd's log output.

    Returns:
        The new path on success. ``None`` when the path is not located under
        ``OFFLINE_ROOT``, when the file does not exist, or when the rename
        fails with an ``OSError`` (a warning is logged for the first and last
        case).
    """
    tile_path = Path(filename)
    try:
        tile_path.relative_to(OFFLINE_ROOT)
    except ValueError:
        inside_root = False
    else:
        # relative_to() compares path components lexically, so a path such as
        # OFFLINE_ROOT/../../x would pass it; reject any ".." component.
        inside_root = ".." not in tile_path.parts

    if not inside_root:
        cloudlog.warning(f"mapd_wrapper refusing to quarantine unexpected path: {filename}")
        return None

    try:
        if not tile_path.exists():
            return None
        quarantined = tile_path.with_name(f"{tile_path.name}.corrupt.{int(time.time())}")
        tile_path.rename(quarantined)
    except OSError as err:
        # The caller treats None as "could not quarantine"; report the failure
        # that way instead of letting the exception take down the wrapper.
        cloudlog.warning(f"mapd_wrapper failed to quarantine {filename}: {err}")
        return None
    return quarantined
|
|
|
|
|
|
def terminate_child(proc: subprocess.Popen[str]) -> None:
    """Stop ``proc`` if it is still running, escalating to kill if needed.

    Sends terminate and waits up to 2 seconds; if the process has not exited
    by then, sends kill and waits up to 2 more seconds. Does nothing when the
    process has already exited.

    Raises:
        subprocess.TimeoutExpired: If the process is still alive 2 seconds
            after the kill.
    """
    still_running = proc.poll() is None
    if still_running:
        proc.terminate()
        try:
            proc.wait(timeout=2)
        except subprocess.TimeoutExpired:
            # Graceful shutdown did not finish in time; force it.
            proc.kill()
            proc.wait(timeout=2)
|
|
|
|
|
|
def run_mapd_once() -> int:
    """Run mapd once, echoing its output and watching for a corrupt tile.

    mapd is started with stderr merged into stdout. Every line is echoed to
    this process's stdout and fed to a ``CorruptTileMonitor``. When the
    monitor flags a tile, an attempt is made to quarantine it and mapd is
    stopped. SIGTERM and SIGINT handlers are installed that stop the child and
    exit with ``128 + signum``.

    Returns:
        1 when a flagged tile was quarantined; 2 when a tile was flagged but
        could not be quarantined; otherwise mapd's own exit status.

    NOTE(review): mapd's own exit status can also be 1 or 2, in which case the
    caller cannot tell it apart from the two wrapper outcomes above. Confirm
    whether that overlap is intended.
    """
    proc = subprocess.Popen(
        [MAPD_BIN.as_posix()],
        cwd=MAPD_DIR,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        # Invalid bytes in the child's output must not abort the read loop
        # with UnicodeDecodeError.
        errors="replace",
        bufsize=1,
    )
    assert proc.stdout is not None

    def _handle_signal(signum, _frame):
        terminate_child(proc)
        raise SystemExit(128 + signum)

    signal.signal(signal.SIGTERM, _handle_signal)
    signal.signal(signal.SIGINT, _handle_signal)

    monitor = CorruptTileMonitor()

    try:
        for line in proc.stdout:
            print(line, end="")
            bad_tile = monitor.observe(line)
            if bad_tile is None:
                continue

            quarantined = quarantine_offline_tile(bad_tile)
            if quarantined is not None:
                message = f"mapd_wrapper quarantined corrupt offline tile: {bad_tile} -> {quarantined}"
                print(message, flush=True)
                cloudlog.warning(message)
                return 1

            if not OFFLINE_ROOT.exists():
                cloudlog.warning(
                    f"mapd_wrapper detected repeated offline read failures for {bad_tile}, "
                    f"but {OFFLINE_ROOT} does not exist; backing off mapd restarts"
                )
            else:
                cloudlog.warning(f"mapd_wrapper detected repeated offline read failures for {bad_tile}, but could not quarantine it")
            return 2

        # stdout reached EOF: collect the child's own exit status.
        return proc.wait()
    finally:
        # Runs on every exit path, including unexpected exceptions raised in
        # the loop, so the child is never left running and the pipe is always
        # closed. terminate_child() is a no-op once the child has exited.
        terminate_child(proc)
        proc.stdout.close()
|
|
|
|
|
|
def main() -> None:
    """Keep running mapd, restarting it after the wrapper's own stop codes.

    A result of 1 from ``run_mapd_once`` restarts after ``RESTART_DELAY_S``
    seconds; a result of 2 restarts after ``MISSING_TILE_BACKOFF_S`` seconds.

    Raises:
        SystemExit: With the returned code, for any result other than 1 or 2.
    """
    while True:
        exit_code = run_mapd_once()
        if exit_code == 1:
            pause_s = RESTART_DELAY_S
        elif exit_code == 2:
            pause_s = MISSING_TILE_BACKOFF_S
        else:
            raise SystemExit(exit_code)
        time.sleep(pause_s)
|
|
|
|
|
|
# Start the supervisor loop only when this file is executed as a script.
if __name__ == "__main__":
    main()
|