Increase Fault Resilience

This commit is contained in:
firestar5683
2026-02-05 00:04:15 -06:00
parent 69fa82d3b4
commit 456e3435d5
2 changed files with 31 additions and 16 deletions
+19 -15
View File
@@ -804,7 +804,8 @@ class Panda:
# The panda will NAK CAN writes when there is CAN congestion.
# libusb will try to send it again, with a max timeout.
# Timeout is in ms. If set to 0, the timeout is infinite.
# NOTE(review): this hunk shows two assignments to CAN_SEND_TIMEOUT_MS. Going by
# the hunk header (-804,7 +804,8), the first is presumably the removed pre-change
# value and the second the value after the commit -- confirm against the repository.
# Read literally as Python, the later assignment (5) is the one that takes effect.
CAN_SEND_TIMEOUT_MS = 10
CAN_SEND_TIMEOUT_MS = 5
# Retry bound read as self.CAN_MAX_RETRIES by can_send_many and can_recv.
CAN_MAX_RETRIES = 3
def can_reset_communications(self):
    """Issue control request 0xc0 on the device handle.

    The request is written with Panda.REQUEST_OUT as the request type, the
    two following arguments set to 0 and an empty payload. Going by the
    method name, the device is expected to reset its CAN communication
    state in response (the device-side handling is not visible here).
    """
    request_type = Panda.REQUEST_OUT
    empty_payload = b''
    self._handle.controlWrite(request_type, 0xc0, 0, 0, empty_payload)
@@ -812,18 +813,18 @@ class Panda:
# Send a batch of CAN messages: `arr` is packed by pack_can_buffer() and every
# resulting chunk is written to bulk endpoint 3 with the given timeout.
# NOTE(review): this is a diff hunk whose +/- markers and indentation were
# stripped. The `while True:` / try / except loop is presumably the removed
# pre-change body and the second `for tx in snds:` loop the added post-change
# body -- confirm against the repository before relying on either.
@ensure_can_packet_version
def can_send_many(self, arr, timeout=CAN_SEND_TIMEOUT_MS):
snds = pack_can_buffer(arr)
# --- first variant (presumably removed): appears to restart the whole batch
# whenever usb1.USBErrorIO or usb1.USBErrorOverflow is raised, without a bound ---
while True:
try:
for tx in snds:
while True:
bs = self._handle.bulkWrite(3, tx, timeout=timeout)
# Drop the bytes that were accepted; whatever remains is written again.
tx = tx[bs:]
if len(tx) == 0:
break
logging.error("CAN: PARTIAL SEND MANY, RETRYING")
break
except (usb1.USBErrorIO, usb1.USBErrorOverflow):
logging.error("CAN: BAD SEND MANY, RETRYING")
# --- second variant (presumably added): bounded retries per chunk ---
# NOTE(review): no except clause for usb1.USBErrorIO / usb1.USBErrorOverflow is
# visible in this variant, so those errors would propagate to the caller --
# confirm that this is intended.
for tx in snds:
retries = 0
while len(tx) > 0:
bs = self._handle.bulkWrite(3, tx, timeout=timeout)
# A write that reports 0 bytes counts as an attempt without progress.
if bs == 0:
retries += 1
# Give up once more than CAN_MAX_RETRIES consecutive writes made no progress.
if retries > self.CAN_MAX_RETRIES:
# The unsent remainder of this chunk is abandoned; the break leaves the
# enclosing while loop, so sending continues with the next chunk.
logging.warning("CAN send: no progress after retries, dropping")
break
else:
# Presumably paired with `if bs == 0`: any progress resets the counter.
retries = 0
tx = tx[bs:]
def can_send(self, addr, dat, bus, timeout=CAN_SEND_TIMEOUT_MS):
    """Send a single CAN message through can_send_many.

    The message is wrapped as the one-element batch
    ``[[addr, None, dat, bus]]`` and ``timeout`` is forwarded unchanged.
    """
    single_message = [addr, None, dat, bus]
    self.can_send_many([single_message], timeout=timeout)
@@ -831,13 +832,16 @@ class Panda:
# Read one batch of up to 16384 bytes from bulk endpoint 1 and return the
# messages produced by unpack_can_buffer().
# NOTE(review): this is a diff hunk whose +/- markers and indentation were
# stripped. Going by the hunk header (-831,13 +832,16), `while True:` and
# `time.sleep(0.1)` are presumably the removed lines, while the `for` header,
# `time.sleep(0.01)` and the trailing `else:` branch are the added ones --
# confirm against the repository.
@ensure_can_packet_version
def can_recv(self):
dat = bytearray()
while True:
# At most CAN_MAX_RETRIES read attempts.
for _ in range(self.CAN_MAX_RETRIES):
try:
dat = self._handle.bulkRead(1, 16384) # Max receive batch size + 2 extra reserve frames
break
except (usb1.USBErrorIO, usb1.USBErrorOverflow):
logging.error("CAN: BAD RECV, RETRYING")
time.sleep(0.1)
time.sleep(0.01)
# Presumably the for loop's else clause: reached only when no attempt hit `break`.
else:
logging.error("CAN: recv failed after retries")
# NOTE(review): an empty list here looks the same to callers as "no messages
# received" -- confirm that this is acceptable.
return []
# The second value returned by unpack_can_buffer() is stored on self and
# prepended to the data read by the next call.
msgs, self.can_rx_overflow_buffer = unpack_can_buffer(self.can_rx_overflow_buffer + dat)
return msgs
+12 -1
View File
@@ -27,7 +27,10 @@ NACK = 0x1F
CHECKSUM_START = 0xAB
# Lower bound for the ack wait: _wait_for_ack computes
# max(MIN_ACK_TIMEOUT_MS, timeout) * 1e-3, i.e. the value is in milliseconds.
MIN_ACK_TIMEOUT_MS = 100
# NOTE(review): not referenced in any of the visible hunks -- confirm it is used.
MAX_ACK_TIMEOUT_MS = 500 # like C++ SPI_ACK_TIMEOUT
# Substituted for the transfer timeout when the caller passes timeout == 0.
DEFAULT_TIMEOUT_MS = 500 # default when timeout=0
MAX_XFER_RETRY_COUNT = 5
# NOTE(review): not referenced in any of the visible hunks -- confirm it is used.
MAX_TIMEOUT_RETRIES = 5 # like C++
# 0x40 * 31 == 1984.
XFER_SIZE = 0x40*31
@@ -152,6 +155,8 @@ class PandaSpiHandle(BaseHandle):
return cksum
def _wait_for_ack(self, spi, ack_val: int, timeout: int, tx: int, length: int = 1) -> bytes:
# Original behavior preserved - timeout=0 means wait forever within this function
# The caller (_transfer) handles the overall timeout
timeout_s = max(MIN_ACK_TIMEOUT_MS, timeout) * 1e-3
start = time.monotonic()
@@ -225,10 +230,15 @@ class PandaSpiHandle(BaseHandle):
logging.debug("starting transfer: endpoint=%d, max_rx_len=%d", endpoint, max_rx_len)
logging.debug("==============================================")
# Fix timeout=0 infinite loop: default to DEFAULT_TIMEOUT_MS
if timeout == 0:
timeout = DEFAULT_TIMEOUT_MS
n = 0
start_time = time.monotonic()
exc = PandaSpiException()
while (timeout == 0) or (time.monotonic() - start_time) < timeout*1e-3:
# Use the timeout for the overall loop, matching original behavior but with timeout=0 fixed
while (time.monotonic() - start_time) < timeout * 1e-3:
n += 1
logging.debug("\ntry #%d", n)
with self.dev.acquire() as spi:
@@ -238,6 +248,7 @@ class PandaSpiHandle(BaseHandle):
exc = e
logging.debug("SPI transfer failed, retrying", exc_info=True)
logging.error("SPI transfer failed after %d tries, %.2fms", n, (time.monotonic() - start_time) * 1000)
raise exc
def get_protocol_version(self) -> bytes: