WifiManager: fix some threading race conditions (#37406)

* interesting epoch approach

* repro

* determ fix

* cmts

* new issue

* test

* clean up

* cmt

* add back

* reorg cmt

* cmt

* clean up

* cmt
This commit is contained in:
Shane Smiskol
2026-02-25 19:09:11 -08:00
committed by GitHub
parent 1550520b63
commit c2a7437972
2 changed files with 31 additions and 17 deletions
@@ -2,9 +2,6 @@
Tests the state machine in isolation by constructing a WifiManager with mocked
DBus, then calling _handle_state_change directly with NM state transitions.
Remaining xfail tests cover thread races (monitor vs main thread) and deferred
features (SSID_NOT_FOUND UI error).
"""
import pytest
from jeepney.low_level import MessageType
@@ -22,6 +19,7 @@ def _make_wm(mocker: MockerFixture, connections=None):
wm._conn_monitor = mocker.MagicMock()
wm._connections = dict(connections or {})
wm._wifi_state = WifiState()
wm._user_epoch = 0
wm._callback_queue = []
wm._need_auth = []
wm._activated = []
@@ -302,14 +300,14 @@ class TestActivated:
# ---------------------------------------------------------------------------
# Thread races: _set_connecting on main thread vs _handle_state_change on monitor thread.
# Uses side_effect on the DBus mock to simulate _set_connecting running mid-handler.
# The epoch counter detects that a user action occurred during the slow DBus call
# and discards the stale update.
# ---------------------------------------------------------------------------
# The deterministic fixes (skip DBus lookup when ssid already set, prev_state guard
# on NEED_AUTH, DEACTIVATING no-op, CONNECTION_REMOVED guard) shrink these race
# windows to near-zero. If still visible, make WifiState frozen (replace() + single
# atomic assignment) and/or add a narrow lock around _wifi_state reads/writes.
# windows significantly. The epoch counter closes the remaining gaps.
class TestThreadRaces:
@pytest.mark.xfail(reason="TODO: PREPARE overwrites _set_connecting via stale DBus lookup")
def test_prepare_race_user_tap_during_dbus(self, mocker):
"""User taps B while PREPARE's DBus call is in flight for auto-connect.
@@ -329,7 +327,6 @@ class TestThreadRaces:
assert wm._wifi_state.ssid == "B"
assert wm._wifi_state.status == ConnectStatus.CONNECTING
@pytest.mark.xfail(reason="TODO: ACTIVATED overwrites _set_connecting with stale CONNECTED state")
def test_activated_race_user_tap_during_dbus(self, mocker):
"""User taps B right as A finishes connecting (ACTIVATED handler running).
+27 -10
View File
@@ -175,6 +175,7 @@ class WifiManager:
# State
self._connections: dict[str, str] = {} # ssid -> connection path, updated via NM signals
self._wifi_state: WifiState = WifiState()
self._user_epoch: int = 0
self._ipv4_address: str = ""
self._current_network_metered: MeteredType = MeteredType.UNKNOWN
self._tethering_password: str = ""
@@ -291,6 +292,8 @@ class WifiManager:
return self._tethering_password
def _set_connecting(self, ssid: str | None):
# Called by user action, or sequentially from state change handler
self._user_epoch += 1
self._wifi_state = WifiState(ssid=ssid, status=ConnectStatus.DISCONNECTED if ssid is None else ConnectStatus.CONNECTING)
def _enqueue_callbacks(self, cbs: list[Callable], *args):
@@ -374,13 +377,16 @@ class WifiManager:
self._handle_state_change(new_state, previous_state, change_reason)
def _handle_state_change(self, new_state: int, prev_state: int, change_reason: int):
# TODO: Thread safety _wifi_state is read/written by both the monitor thread (this
# handler) and the main thread (_set_connecting via connect/activate). The GIL makes
# individual assignments atomic, but ACTIVATED still has a read-then-write pattern with
# a DBus call in between: if _set_connecting runs mid-call, the handler overwrites it.
# The deterministic fixes (skip DBus lookup when ssid set, prev_state guard, DEACTIVATING
# no-op, CONNECTION_REMOVED guard) shrink the race windows significantly. If still
# visible, add a narrow lock around _wifi_state reads/writes (not around DBus calls).
# Thread safety: _wifi_state is read/written by both the monitor thread (this handler)
# and the main thread (_set_connecting via connect/activate). PREPARE/CONFIG and ACTIVATED
# have a read-then-write pattern with a slow DBus call in between — if _set_connecting
# runs mid-call, the handler would overwrite the user's newer state with stale data.
#
# The _user_epoch counter solves this without locks. _set_connecting increments the epoch
# on every user action. Handlers snapshot the epoch before their DBus call and compare
# after: if it changed, a user action occurred during the call and the stale result is
# discarded. Combined with deterministic fixes (skip DBus lookup when ssid already set,
# DEACTIVATING no-op, CONNECTION_REMOVED guard), all known race windows are closed.
# TODO: Handle (FAILED, SSID_NOT_FOUND) and emit for UI to show error
# Happens when network drops off after starting connection
@@ -399,6 +405,8 @@ class WifiManager:
self._set_connecting(None)
elif new_state in (NMDeviceState.PREPARE, NMDeviceState.CONFIG):
epoch = self._user_epoch
if self._wifi_state.ssid is not None:
self._wifi_state = replace(self._wifi_state, status=ConnectStatus.CONNECTING)
return
@@ -407,6 +415,10 @@ class WifiManager:
wifi_state = replace(self._wifi_state, status=ConnectStatus.CONNECTING)
conn_path, _ = self._get_active_wifi_connection(self._conn_monitor)
if self._user_epoch != epoch:
return
if conn_path is None:
cloudlog.warning("Failed to get active wifi connection during PREPARE/CONFIG state")
else:
@@ -417,14 +429,14 @@ class WifiManager:
# BAD PASSWORD
# - strong network rejects with NEED_AUTH+SUPPLICANT_DISCONNECT
# - weak/gone network fails with FAILED+NO_SECRETS
# prev_state guard: real auth failures come from CONFIG (supplicant handshake).
# Stale NEED_AUTH from a prior connection during network switching arrives with
# prev_state=DISCONNECTED and must be ignored to avoid a false wrong-password callback.
# TODO: sometimes on PC it's observed no future signals are fired if mouse is held down blocking wrong password dialog
elif ((new_state == NMDeviceState.NEED_AUTH and change_reason == NMDeviceStateReason.SUPPLICANT_DISCONNECT
and prev_state == NMDeviceState.CONFIG) or
(new_state == NMDeviceState.FAILED and change_reason == NMDeviceStateReason.NO_SECRETS)):
# prev_state guard: real auth failures come from CONFIG (supplicant handshake).
# Stale NEED_AUTH from a prior connection during network switching arrives with
# prev_state=DISCONNECTED and must be ignored to avoid a false wrong-password callback.
if self._wifi_state.ssid:
self._enqueue_callbacks(self._need_auth, self._wifi_state.ssid)
self._set_connecting(None)
@@ -435,9 +447,14 @@ class WifiManager:
elif new_state == NMDeviceState.ACTIVATED:
# Note that IP address from Ip4Config may not be propagated immediately and could take until the next scan results
epoch = self._user_epoch
wifi_state = replace(self._wifi_state, status=ConnectStatus.CONNECTED)
conn_path, _ = self._get_active_wifi_connection(self._conn_monitor)
if self._user_epoch != epoch:
return
if conn_path is None:
cloudlog.warning("Failed to get active wifi connection during ACTIVATED state")
else: