WifiManager: fix deterministic state mismatches (#37407)

* hmm

* revert to master

* context too big

* fresh context

* early return

* early return

* tests

* restore cmts

* lester nester

* note

* add

* final review

* cmt
This commit is contained in:
Shane Smiskol
2026-02-25 17:25:31 -08:00
committed by GitHub
parent 7835b9aa17
commit bcb4a6a3e3
2 changed files with 37 additions and 51 deletions
@@ -3,8 +3,8 @@
Tests the state machine in isolation by constructing a WifiManager with mocked
DBus, then calling _handle_state_change directly with NM state transitions.
Many tests assert *desired* behavior that the current code doesn't implement yet.
These are marked with pytest.mark.xfail and document the intended fix.
Remaining xfail tests cover thread races (monitor vs main thread) and deferred
features (SSID_NOT_FOUND UI error).
"""
import pytest
from pytest_mock import MockerFixture
@@ -72,7 +72,6 @@ class TestDisconnected:
assert wm._wifi_state.ssid == "OldNet"
assert wm._wifi_state.status == ConnectStatus.CONNECTED
@pytest.mark.xfail(reason="TODO: CONNECTION_REMOVED should only clear if ssid not in _connections")
def test_connection_removed_keeps_other_connecting(self, mocker):
"""Forget A while connecting to B: CONNECTION_REMOVED for A must not clear B."""
wm = _make_wm(mocker, connections={"B": "/path/B"})
@@ -95,12 +94,8 @@ class TestDisconnected:
class TestDeactivating:
@pytest.mark.xfail(reason="TODO: DEACTIVATING should be a no-op")
def test_deactivating_is_noop(self, mocker):
"""DEACTIVATING should be a no-op — DISCONNECTED follows with correct state.
Fix: remove the entire DEACTIVATING elif block — do nothing for any reason.
"""
"""DEACTIVATING is a no-op — DISCONNECTED follows with the correct reason."""
wm = _make_wm(mocker)
wm._wifi_state = WifiState(ssid="Net", status=ConnectStatus.CONNECTED)
@@ -111,7 +106,6 @@ class TestDeactivating:
class TestPrepareConfig:
@pytest.mark.xfail(reason="TODO: should skip DBus lookup when ssid already set")
def test_user_initiated_skips_dbus_lookup(self, mocker):
"""User called _set_connecting('B') — PREPARE must not overwrite via DBus.
@@ -309,9 +303,9 @@ class TestActivated:
# Uses side_effect on the DBus mock to simulate _set_connecting running mid-handler.
# ---------------------------------------------------------------------------
# The deterministic fixes (skip DBus lookup when ssid already set, prev_state guard
# on NEED_AUTH) also shrink these race windows to near-zero. If races are still
# visible after, make WifiState frozen (replace() + single atomic assignment) and/or
# add a narrow lock around _wifi_state reads/writes (not around DBus calls).
# on NEED_AUTH, DEACTIVATING no-op, CONNECTION_REMOVED guard) shrink these race
# windows to near-zero. If still visible, make WifiState frozen (replace() + single
# atomic assignment) and/or add a narrow lock around _wifi_state reads/writes.
class TestThreadRaces:
@pytest.mark.xfail(reason="TODO: PREPARE overwrites _set_connecting via stale DBus lookup")
@@ -496,7 +490,6 @@ class TestFullSequences:
fire_wpa_connect(wm)
assert wm._wifi_state.status == ConnectStatus.CONNECTED
@pytest.mark.xfail(reason="TODO: forget A while connecting to B should not clear B")
def test_forget_A_connect_B(self, mocker):
"""Forget A while connecting to B: full signal sequence.
@@ -508,7 +501,7 @@ class TestFullSequences:
Signal order:
1. User: _set_connecting("B"), forget("A") removes A from _connections
2. NewConnection for B arrives → _connections["B"] = ...
3. DEACTIVATING(CONNECTION_REMOVED) — should be no-op
3. DEACTIVATING(CONNECTION_REMOVED) — no-op
4. DISCONNECTED(CONNECTION_REMOVED) — B is in _connections, must not clear
5. PREPARE → CONFIG → NEED_AUTH → PREPARE → CONFIG → ... → ACTIVATED
"""
@@ -536,7 +529,6 @@ class TestFullSequences:
assert wm._wifi_state.status == ConnectStatus.CONNECTED
assert wm._wifi_state.ssid == "B"
@pytest.mark.xfail(reason="TODO: forget A while connecting to B should not clear B")
def test_forget_A_connect_B_late_new_connection(self, mocker):
"""Forget A, connect B: NewConnection for B arrives AFTER DISCONNECTED.
+30 -36
View File
@@ -374,40 +374,36 @@ class WifiManager:
self._handle_state_change(new_state, previous_state, change_reason)
def _handle_state_change(self, new_state: int, prev_state: int, change_reason: int):
# TODO: known race conditions when switching networks (e.g. forget A, connect to B):
# 1. DEACTIVATING/DISCONNECTED + CONNECTION_REMOVED: fires before NewConnection for B
# arrives, so _set_connecting(None) clears B's CONNECTING state causing UI flicker.
# DEACTIVATING(CONNECTION_REMOVED): wifi_state (B, CONNECTING) -> (None, DISCONNECTED)
# Fix: make DEACTIVATING a no-op, and guard DISCONNECTED with
# `if wifi_state.ssid not in _connections` (NewConnection arrives between the two).
# 2. PREPARE/CONFIG ssid lookup: DBus may return stale A's conn_path, overwriting B.
# PREPARE(0): wifi_state (B, CONNECTING) -> (A, CONNECTING)
# Fix: only do DBus lookup when wifi_state.ssid is None (auto-connections);
# user-initiated connections already have ssid set via _set_connecting.
# TODO: Thread safety — _wifi_state is read/written by both the monitor thread (this
# handler) and the main thread (_set_connecting via connect/activate). The GIL makes
# individual assignments atomic, but read-then-write patterns can lose main thread writes:
# PREPARE/CONFIG: reads _wifi_state, does slow DBus call, writes back — if
# _set_connecting runs in between, the handler overwrites it with stale state.
# This is both a deterministic bug (stale DBus data) and a thread race. The
# deterministic fix (skip DBus lookup when ssid is set) also shrinks the race
# window to near-zero since the read/write happen back-to-back under the GIL.
# ACTIVATED: same read-then-write pattern with a DBus call in between.
# The deterministic fixes (skip DBus lookup when ssid set, prev_state guard) shrink
# the race windows significantly. If still visible, add a narrow lock around
# _wifi_state reads/writes (not around DBus calls, to avoid blocking the UI thread).
# individual assignments atomic, but ACTIVATED still has a read-then-write pattern with
# a DBus call in between: if _set_connecting runs mid-call, the handler overwrites it.
# The deterministic fixes (skip DBus lookup when ssid set, prev_state guard, DEACTIVATING
# no-op, CONNECTION_REMOVED guard) shrink the race windows significantly. If still
# visible, add a narrow lock around _wifi_state reads/writes (not around DBus calls).
# TODO: Handle (FAILED, SSID_NOT_FOUND) and emit for ui to show error
# TODO: Handle (FAILED, SSID_NOT_FOUND) and emit for UI to show error
# Happens when network drops off after starting connection
if new_state == NMDeviceState.DISCONNECTED:
if change_reason != NMDeviceStateReason.NEW_ACTIVATION:
# catches CONNECTION_REMOVED reason when connection is forgotten
self._set_connecting(None)
if change_reason == NMDeviceStateReason.NEW_ACTIVATION:
return
# Guard: forget A while connecting to B fires CONNECTION_REMOVED. Don't clear B's state
# if B is still a known connection. If B hasn't arrived in _connections yet (late
# NewConnection), state clears here but PREPARE recovers via DBus lookup.
if (change_reason == NMDeviceStateReason.CONNECTION_REMOVED and self._wifi_state.ssid and
self._wifi_state.ssid in self._connections):
return
self._set_connecting(None)
elif new_state in (NMDeviceState.PREPARE, NMDeviceState.CONFIG):
# Set connecting status when NetworkManager connects to known networks on its own
if self._wifi_state.ssid is not None:
self._wifi_state = replace(self._wifi_state, status=ConnectStatus.CONNECTING)
return
# Auto-connection when NetworkManager connects to known networks on its own (ssid=None): look up ssid from NM
wifi_state = replace(self._wifi_state, status=ConnectStatus.CONNECTING)
conn_path, _ = self._get_active_wifi_connection(self._conn_monitor)
@@ -444,25 +440,23 @@ class WifiManager:
conn_path, _ = self._get_active_wifi_connection(self._conn_monitor)
if conn_path is None:
cloudlog.warning("Failed to get active wifi connection during ACTIVATED state")
self._wifi_state = wifi_state
self._enqueue_callbacks(self._activated)
self._update_active_connection_info()
else:
wifi_state.ssid = next((s for s, p in self._connections.items() if p == conn_path), None)
self._wifi_state = wifi_state
self._enqueue_callbacks(self._activated)
self._update_active_connection_info()
# Persist volatile connections (created by AddAndActivateConnection2) to disk
self._wifi_state = wifi_state
self._enqueue_callbacks(self._activated)
self._update_active_connection_info()
# Persist volatile connections (created by AddAndActivateConnection2) to disk
if conn_path is not None:
conn_addr = DBusAddress(conn_path, bus_name=NM, interface=NM_CONNECTION_IFACE)
save_reply = self._conn_monitor.send_and_get_reply(new_method_call(conn_addr, 'Save'))
if save_reply.header.message_type == MessageType.error:
cloudlog.warning(f"Failed to persist connection to disk: {save_reply}")
elif new_state == NMDeviceState.DEACTIVATING:
if change_reason == NMDeviceStateReason.CONNECTION_REMOVED:
# When connection is forgotten
self._set_connecting(None)
# no-op — DISCONNECTED always follows with the correct reason
pass
def _network_scanner(self):
while not self._exit: