From 6bcf4a99f24d5fc0114e595ea87dbb7ca04a4b9a Mon Sep 17 00:00:00 2001
From: Alex Cheema
Date: Sun, 3 May 2026 02:34:21 +0100
Subject: [PATCH] Tune cluster liveness polling cadence

---
 src/exo/master/main.py | 7 +++++--
 src/exo/worker/main.py | 4 +++-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/exo/master/main.py b/src/exo/master/main.py
index 679797cfb6..1daa5387bc 100644
--- a/src/exo/master/main.py
+++ b/src/exo/master/main.py
@@ -485,6 +485,9 @@ async def _command_processor(self) -> None:
 
     # These plan loops are the cracks showing in our event sourcing architecture - more things could be commands
     async def _plan(self) -> None:
+        node_inactivity_timeout = timedelta(seconds=5)
+        tick_interval_seconds = 1.0
+
         while True:
             # kill broken instances
             connected_node_ids = set(self.state.topology.list_nodes())
@@ -499,11 +502,11 @@ async def _plan(self) -> None:
             # time out dead nodes
             for node_id, time in self.state.last_seen.items():
                 now = datetime.now(tz=timezone.utc)
-                if now - time > timedelta(seconds=30):
+                if now - time > node_inactivity_timeout:
                     logger.info(f"Manually removing node {node_id} due to inactivity")
                     await self.event_sender.send(NodeTimedOut(node_id=node_id))
 
-            await anyio.sleep(10)
+            await anyio.sleep(tick_interval_seconds)
 
     async def _event_processor(self) -> None:
         with self.local_event_receiver as local_events:
diff --git a/src/exo/worker/main.py b/src/exo/worker/main.py
index 1b76efce82..417ce992df 100644
--- a/src/exo/worker/main.py
+++ b/src/exo/worker/main.py
@@ -445,6 +445,8 @@ def _create_supervisor(self, task: CreateRunner) -> RunnerSupervisor:
         return runner
 
     async def _poll_connection_updates(self):
+        poll_interval_seconds = 2.0
+
         while True:
             edges = set(
                 conn.edge for conn in self.state.topology.out_edges(self.node_id)
@@ -487,4 +489,4 @@ async def _poll_connection_updates(self):
                     logger.debug(f"ping failed to discover {conn=}")
                     await self.event_sender.send(TopologyEdgeDeleted(conn=conn))
 
-            await anyio.sleep(10)
+            await anyio.sleep(poll_interval_seconds)