Index: cloudsync/utils/watchdog.py =================================================================== diff -u -r4c51787148b0f3db26bdc8e27d7632e7f972a86a -rf8e86cb57ba35fd2a6f6ec1b7989443bff221b0d --- cloudsync/utils/watchdog.py (.../watchdog.py) (revision 4c51787148b0f3db26bdc8e27d7632e7f972a86a) +++ cloudsync/utils/watchdog.py (.../watchdog.py) (revision f8e86cb57ba35fd2a6f6ec1b7989443bff221b0d) @@ -4,10 +4,10 @@ on failure (up to a configurable limit), and creates a sentinel file to request full process restart when recovery is exhausted. -F1 — 0.5.5: extends the watchdog with liveness-progress monitoring. +Liveness-progress monitoring (in addition to ``is_alive()`` checks). ``threading.Thread.is_alive()`` returns True for threads blocked in kernel -syscalls, ``event.wait()``, or busy-waits. The HD000118-style stuck-but-alive -patterns observed in DIAS-43 go completely undetected by is_alive() alone. +syscalls, ``event.wait()``, or busy-waits. Stuck-but-alive patterns are +invisible to is_alive() alone. The extended API accepts an optional ``progress_fn`` and ``max_idle_s`` per registration. On each sweep the watchdog treats stale progress the same way @@ -20,8 +20,8 @@ ``register_sentinel_condition`` provides a second escalation path: an arbitrary zero-arg predicate checked every sweep. If it returns True, the watchdog immediately creates the sentinel (no restart attempt). Used for -chronic queue-exhaustion (F4 counter) where individual threads stay alive -but the pipeline is starved. +chronic queue-exhaustion (queue-full event counter) where individual +threads stay alive but the pipeline is starved. """ import os @@ -50,7 +50,7 @@ self._max_restarts = max_restarts self._sentinel_path = sentinel_path self._entries = {} - # F1 — sentinel-condition registry (name → predicate) + # Sentinel-condition registry (name → predicate) self._sentinel_conditions = {} self._stop_event = Event() self._thread = Thread(target=self._monitor, daemon=True) @@ -69,10 +69,10 @@ :param callable restart_fn: Zero-arg callable that creates and starts a replacement thread. Use :func:`make_restart_fn` for the common case of daemon threads backed by a bound method. - :param callable progress_fn: (F1) optional zero-arg callable returning - a unix-epoch timestamp of the thread's last semantic progress. + :param callable progress_fn: Optional zero-arg callable returning + a monotonic timestamp of the thread's last semantic progress. If omitted, only dead-thread detection is performed. - :param float max_idle_s: (F1) maximum seconds of staleness tolerated + :param float max_idle_s: Maximum seconds of staleness tolerated before the thread counts as failed. Required when ``progress_fn`` is provided. """ @@ -92,7 +92,7 @@ Used for conditions where individual threads are alive but the pipeline as a whole is starved — e.g. chronic queue saturation - (F4 ``helpers_queue_full_event_count`` > threshold). + (``helpers_queue_full_event_count`` > threshold). :param str name: Diagnostic label (appears in sentinel file + logs). :param callable condition_fn: Zero-arg callable returning a bool. @@ -132,7 +132,7 @@ def _check_threads(self): """Single sweep: check every registered thread + sentinel condition.""" - # F1: first evaluate sentinel conditions — they short-circuit thread + # First evaluate sentinel conditions — they short-circuit thread # checks. A True predicate means the pipeline is chronically # starved and restarting individual threads will not help. for name, condition_fn in self._sentinel_conditions.items(): @@ -153,7 +153,7 @@ thread = entry["get_thread"]() is_dead = thread is None or not thread.is_alive() - # F1: check progress staleness even when the thread is alive. + # Check progress staleness even when the thread is alive. is_stale = False stale_reason = "" if not is_dead and entry["progress_fn"] is not None: @@ -169,7 +169,7 @@ if last_ts == 0: is_stale = False else: - # F13 — compare in monotonic time. A wall-clock step + # Compare in monotonic time. A wall-clock step # (operator `date` command or ntpd correction) must not # trip the watchdog; only real elapsed idle time matters. age = monotonic() - last_ts @@ -179,7 +179,7 @@ % (age, entry["max_idle_s"])) if not is_dead and not is_stale: - # F1: reset failure counter on a fully-healthy read. This is + # Reset failure counter on a fully-healthy read. This is # the "3 consecutive stale reads" rule — any healthy read # wipes the slate clean and prevents restart-storm escalation # on transient slowness.