Increase Fault Resilience

2026-07-03 04:22:09 +08:00 · 2026-02-05 00:04:15 -06:00
parent 69fa82d3b4
commit 456e3435d5
2 changed files with 31 additions and 16 deletions
@@ -804,7 +804,8 @@ class Panda:
  # The panda will NAK CAN writes when there is CAN congestion.
  # libusb will try to send it again, with a max timeout.
  # Timeout is in ms. If set to 0, the timeout is infinite.
-  CAN_SEND_TIMEOUT_MS = 10
+  CAN_SEND_TIMEOUT_MS = 5
+  CAN_MAX_RETRIES = 3

  def can_reset_communications(self):
    self._handle.controlWrite(Panda.REQUEST_OUT, 0xc0, 0, 0, b'')
@@ -812,18 +813,18 @@ class Panda:
  @ensure_can_packet_version
  def can_send_many(self, arr, timeout=CAN_SEND_TIMEOUT_MS):
    snds = pack_can_buffer(arr)
-    while True:
-      try:
-        for tx in snds:
-          while True:
-            bs = self._handle.bulkWrite(3, tx, timeout=timeout)
-            tx = tx[bs:]
-            if len(tx) == 0:
-              break
-            logging.error("CAN: PARTIAL SEND MANY, RETRYING")
-        break
-      except (usb1.USBErrorIO, usb1.USBErrorOverflow):
-        logging.error("CAN: BAD SEND MANY, RETRYING")
+    for tx in snds:
+      retries = 0
+      while len(tx) > 0:
+        bs = self._handle.bulkWrite(3, tx, timeout=timeout)
+        if bs == 0:
+          retries += 1
+          if retries > self.CAN_MAX_RETRIES:
+            logging.warning("CAN send: no progress after retries, dropping")
+            break
+        else:
+          retries = 0
+        tx = tx[bs:]

  def can_send(self, addr, dat, bus, timeout=CAN_SEND_TIMEOUT_MS):
    self.can_send_many([[addr, None, dat, bus]], timeout=timeout)
@@ -831,13 +832,16 @@ class Panda:
  @ensure_can_packet_version
  def can_recv(self):
    dat = bytearray()
-    while True:
+    for _ in range(self.CAN_MAX_RETRIES):
      try:
        dat = self._handle.bulkRead(1, 16384) # Max receive batch size + 2 extra reserve frames
        break
      except (usb1.USBErrorIO, usb1.USBErrorOverflow):
        logging.error("CAN: BAD RECV, RETRYING")
-        time.sleep(0.1)
+        time.sleep(0.01)
+    else:
+      logging.error("CAN: recv failed after retries")
+      return []
    msgs, self.can_rx_overflow_buffer = unpack_can_buffer(self.can_rx_overflow_buffer + dat)
    return msgs

@@ -27,7 +27,10 @@ NACK = 0x1F
 CHECKSUM_START = 0xAB

 MIN_ACK_TIMEOUT_MS = 100
+MAX_ACK_TIMEOUT_MS = 500  # like C++ SPI_ACK_TIMEOUT
+DEFAULT_TIMEOUT_MS = 500  # default when timeout=0
 MAX_XFER_RETRY_COUNT = 5
+MAX_TIMEOUT_RETRIES = 5  # like C++

 XFER_SIZE = 0x40*31

@@ -152,6 +155,8 @@ class PandaSpiHandle(BaseHandle):
    return cksum

  def _wait_for_ack(self, spi, ack_val: int, timeout: int, tx: int, length: int = 1) -> bytes:
+    # timeout=0 still means "wait forever" inside this function (behavior unchanged).
+    # Bounding the overall transfer time is the job of the caller (_transfer).
    timeout_s = max(MIN_ACK_TIMEOUT_MS, timeout) * 1e-3

    start = time.monotonic()
@@ -225,10 +230,15 @@ class PandaSpiHandle(BaseHandle):
    logging.debug("starting transfer: endpoint=%d, max_rx_len=%d", endpoint, max_rx_len)
    logging.debug("==============================================")

+    # timeout=0 used to make the retry loop below run forever; fall back to DEFAULT_TIMEOUT_MS instead.
+    if timeout == 0:
+      timeout = DEFAULT_TIMEOUT_MS
+
    n = 0
    start_time = time.monotonic()
    exc = PandaSpiException()
-    while (timeout == 0) or (time.monotonic() - start_time) < timeout*1e-3:
+    # Retry until the overall deadline passes; timeout is never 0 here, so the loop always terminates.
+    while (time.monotonic() - start_time) < timeout * 1e-3:
      n += 1
      logging.debug("\ntry #%d", n)
      with self.dev.acquire() as spi:
@@ -238,6 +248,7 @@ class PandaSpiHandle(BaseHandle):
          exc = e
          logging.debug("SPI transfer failed, retrying", exc_info=True)

+    logging.error("SPI transfer failed after %d tries, %.2fms", n, (time.monotonic() - start_time) * 1000)
    raise exc

  def get_protocol_version(self) -> bytes: