Index: cloud_sync.py =================================================================== diff -u -r79dadd7422855193f345beced530c6b92fc49b3a -r8dda975c3b3303764304ed174177c22bbf0f6367 --- cloud_sync.py (.../cloud_sync.py) (revision 79dadd7422855193f345beced530c6b92fc49b3a) +++ cloud_sync.py (.../cloud_sync.py) (revision 8dda975c3b3303764304ed174177c22bbf0f6367) @@ -150,11 +150,15 @@ # Heartbeat gets the idempotent lane + g_config so it can drain # rotated CS-log files one-per-tick when that lane is idle. Registration # mode sets send_heartbeat=False below, so opportunistic CS-log upload - # stays inert during registration. + # stays inert during registration. log_handler drives the silent-day + # force-rotate path so devices that don't emit (e.g. logLevel=ERROR + # with no errors) still produce one rotated file per UTC day for the + # opportunistic upload step above to send. heartbeat_provider = HeartBeatProvider(logger=app.logger, network_request_handler=network_request_handler, output_channel=output_channel, idempotent_network_request_handler=idempotent_network_request_handler, - g_config=g_config) + g_config=g_config, + log_handler=logconf.log_handler) # CS-log uploads (SEND_CS_LOG) go through the idempotent lane. 
logconf.set_network_provider(network_request_handler=idempotent_network_request_handler) Index: cloudsync/handlers/cs_mft_dcs_request_handler.py =================================================================== diff -u -r79dadd7422855193f345beced530c6b92fc49b3a -r8dda975c3b3303764304ed174177c22bbf0f6367 --- cloudsync/handlers/cs_mft_dcs_request_handler.py (.../cs_mft_dcs_request_handler.py) (revision 79dadd7422855193f345beced530c6b92fc49b3a) +++ cloudsync/handlers/cs_mft_dcs_request_handler.py (.../cs_mft_dcs_request_handler.py) (revision 8dda975c3b3303764304ed174177c22bbf0f6367) @@ -667,6 +667,21 @@ organization_id = response.get("associatedOrganizationId", None) cs_log_data['organizationId'] = organization_id + # System-tenant default: when the device is not yet bound to + # a customer tenant, route the cs-log to organizationId=1 + # (DCS system tenant) instead of skipping. Keeps diagnostic + # data flowing to a Diality-operator-visible view during the + # un-associated window without co-mingling into a customer + # tenant later. See CS_LOG_SYSTEM_TENANT_ORG_ID in globals. 
+ if organization_id is None: + organization_id = CS_LOG_SYSTEM_TENANT_ORG_ID + cs_log_data['organizationId'] = organization_id + self.logger.info( + "Device not yet associated with a customer tenant; " + "routing CS-log to system tenant (organizationId=%d): %s", + organization_id, cs_log_data.get('path', ''), + ) + # Step #3 - upload the cs log file if not os.path.exists(cs_log_data['path']): @@ -684,12 +699,31 @@ correlation_id=_cid, device_sn=_dsn) if isinstance(cs_log_filename, dict) and not cs_log_filename.get("accepted"): - self.logger.info(f"CS log file rejected as duplicate: {cs_log_filename.get('filename')}") - try: - os.remove(cs_log_data['path']) - self.logger.info(f"Duplicate CS log file cleaned up: {cs_log_data['path']}") - except OSError: - pass # already removed by log retention + reason_code = cs_log_filename.get("reason_code") + if reason_code == LogUploadReasonCode.DUPLICATE.value: + # DCS already has the file -- safe to drop the local copy. + self.logger.info( + f"CS log file rejected as duplicate (DCS already has it): " + f"{cs_log_filename.get('filename')}" + ) + try: + os.remove(cs_log_data['path']) + self.logger.info(f"Duplicate CS log file cleaned up: {cs_log_data['path']}") + except OSError: + pass # already removed by log retention + else: + # Transient (network, 5xx, 400, refresh-failed 401, give-up) + # or non-duplicate failure. Retain the local file; the + # opportunistic heartbeat tick (~20 s when idle + reachable) + # will pick it up again from the on-disk backlog. Upload is + # decoupled from midnight rotation per heartbeat.py:51-53. + # Unconditional deletion here was the data-loss root cause + # closed by this fix. 
+                        self.logger.error(
+                            f"CS log upload failed (reason_code={reason_code}); "
+                            f"local file retained for next opportunistic upload tick: "
+                            f"{cs_log_data['path']}"
+                        )
                 elif isinstance(cs_log_filename, str):
                     self.logger.debug(f"CS log file uploaded: {cs_log_filename}")
                     try:
Index: cloudsync/handlers/logs_handler.py
===================================================================
diff -u -r79dadd7422855193f345beced530c6b92fc49b3a -r8dda975c3b3303764304ed174177c22bbf0f6367
--- cloudsync/handlers/logs_handler.py (.../logs_handler.py) (revision 79dadd7422855193f345beced530c6b92fc49b3a)
+++ cloudsync/handlers/logs_handler.py (.../logs_handler.py) (revision 8dda975c3b3303764304ed174177c22bbf0f6367)
@@ -22,6 +22,8 @@
 so no prior rotated file is silently overwritten.
 """
 
+import logging
+import sys
 from logging.handlers import TimedRotatingFileHandler
 
 from cloudsync.utils.helpers import *
@@ -132,6 +134,121 @@
         except Exception:
             pass
 
+    def force_rotate_if_date_changed(self):
+        """Force a rotation when the UTC date has advanced past
+        ``self._active_log_date``, even when no log emit has happened
+        in the interim.
+
+        The stdlib ``TimedRotatingFileHandler`` invokes
+        ``shouldRollover()`` only from inside ``emit()``. A silent-day
+        device (``logLevel=ERROR`` and not erroring) never emits, so it
+        never rotates, so the heartbeat opportunistic-upload pathway
+        has nothing to send. This method is the heartbeat's hook to
+        force a rotation on those silent days.
+
+        When the active file is still empty, writes a self-describing
+        marker line into the active stream before rotating, so the
+        rotated file is non-empty (an empty rotated file would be
+        indistinguishable from a corrupted log to downstream tooling).
+        A file that already holds records for the covered day is
+        rotated as-is, without the marker. Marker shape matches the
+        file's default formatter (logging.py:default_formatter) but is
+        constructed manually because we write directly to the stream —
+        emitting a record would re-enter the very handler we're rotating.
+
+        Same-UTC-date is the fast path no-op (called every heartbeat
+        tick, ~20 s).
+
+        Lock-guarded under ``self.lock`` (the stdlib handler's internal
+        RLock) so it serialises with concurrent ``emit()`` calls. Any
+        exception raised inside is swallowed — rotation failure MUST
+        NOT propagate to break the heartbeat tick.
+        """
+        try:
+            current_date = self._current_utc_date()
+        except Exception:
+            return
+        if current_date == self._active_log_date:
+            return
+
+        with self.lock:
+            # Re-check inside the lock: a concurrent emit() may have
+            # already triggered a stdlib rotation that updated
+            # _active_log_date.
+            try:
+                if self._current_utc_date() == self._active_log_date:
+                    return
+            except Exception:
+                return
+
+            try:
+                prior_date = self._active_log_date
+                # Resolve effective log-level word from the handler's
+                # own threshold (set by LoggingConfig.set_log_level via
+                # self.log_handler.setLevel).
+                try:
+                    level_word = logging.getLevelName(self.level).lower()
+                    if not level_word or level_word.startswith("level "):
+                        level_word = "unknown"
+                except Exception:
+                    level_word = "unknown"
+
+                # Resolve device serial. g_config is set via
+                # set_configuration() but may be None at startup or in
+                # a test harness — fall back to the same string the
+                # CorrelationFilter falls back to (logging.py).
+                try:
+                    if self.g_config is not None:
+                        hd_serial = self.g_config[CONFIG_DEVICE][
+                            CONFIG_DEVICE_HD_SERIAL]
+                    else:
+                        hd_serial = "unregistered"
+                except Exception:
+                    hd_serial = "unregistered"
+
+                marker = (
+                    "[{date} 23:59:59,999] INFO [] [{sn}] in logs_handler: "
+                    "No logs available for the covered interval with log "
+                    "level {level} | "
+                    "{{logs_handler.py:force_rotate_if_date_changed}}\n"
+                ).format(
+                    date=prior_date.isoformat(),
+                    sn=hd_serial,
+                    level=level_word,
+                )
+
+                if self.stream is None:
+                    # Delay-open mode and no emit has happened yet on
+                    # this active file. Open so the marker has somewhere
+                    # to land.
+                    self.stream = self._open()
+                # Stamp the marker only when the active file is still
+                # empty. If records were emitted earlier in the covered
+                # day the file is already non-empty and "No logs
+                # available" would be false. Seek-to-end before tell()
+                # mirrors stdlib RotatingFileHandler.shouldRollover().
+                self.stream.seek(0, 2)
+                if self.stream.tell() == 0:
+                    self.stream.write(marker)
+                    try:
+                        self.stream.flush()
+                    except Exception:
+                        pass
+
+                self.doRollover()
+            except Exception as exc:
+                # Best-effort: rotation failure MUST NOT propagate. The
+                # next heartbeat tick retries. Logging through the
+                # regular logger could recurse into the very handler in
+                # a bad state, so write the warning directly to stderr
+                # (mirrors stdlib Handler.handleError's default).
+                try:
+                    sys.stderr.write(
+                        "WARNING force_rotate_if_date_changed failed: "
+                        "{0}\n".format(exc))
+                except Exception:
+                    pass
+
     def set_network_provider(self, network_request_handler):
         self.network_request_handler = network_request_handler
 
Index: cloudsync/handlers/ui_cs_request_handler.py
===================================================================
diff -u -r79dadd7422855193f345beced530c6b92fc49b3a -r8dda975c3b3303764304ed174177c22bbf0f6367
--- cloudsync/handlers/ui_cs_request_handler.py (.../ui_cs_request_handler.py) (revision 79dadd7422855193f345beced530c6b92fc49b3a)
+++ cloudsync/handlers/ui_cs_request_handler.py (.../ui_cs_request_handler.py) (revision 8dda975c3b3303764304ed174177c22bbf0f6367)
@@ -237,10 +237,11 @@
                                           ErrorIDs.CS_SAVE_CONFIG_ERROR.value,
                                           "Error updating device config file")
                     self._enqueue_error(error, message.correlation_id)
-            except Exception as e:
+            except Exception:
+                self.logger.exception("1001 unexpected failure during device registration")
                 error = Error.general(OutboundMessageIDs.CS2UI_ERROR.value,
                                       ErrorIDs.CS_REQ_REGISTRATION_ERROR.value,
-                                      str(e))
+                                      "Internal error during device registration")
                 self._enqueue_error(error, message.correlation_id)
 
         # OPERATION MODE
@@ -289,17 +290,30 @@
                                         ErrorIDs.CS_SEND_DEVICE_STATE_ERROR.value,
                                         "Queue {0}: {1}".format(outcome, detail))
                     self._enqueue_error(err, message.correlation_id)
-            except Exception as e:
+            except Exception:
+                self.logger.exception("1006 unexpected failure sending device state")
                 error = Error.general(OutboundMessageIDs.CS2UI_ERROR.value,
ErrorIDs.CS_SEND_DEVICE_STATE_ERROR.value, - str(e)) + "Internal error sending device state") self._enqueue_error(error, message.correlation_id) # SEND TREATMENT REPORT REQUEST elif InboundMessageIDs.mapped_str_value(message.ID) == InboundMessageIDs.UI2CS_SEND_TREATMENT_REPORT and \ (message.g_config[CONFIG_DEVICE][CONFIG_DEVICE_MODE] == 'operation'): self.logger.info("UI2CS_SEND_TREATMENT_REPORT request received") + # Customer-readable wire-frame strings per typed parse-error + # category. Comma-free by construction so the CSV-encoded + # 2999,...,907, frame does not split mid-message on + # the receiver. Full Python detail (when present) reaches + # cloudsync.log via the catch-all branch below, never the + # wire frame. + _TREATMENT_REPORT_ERROR_MAP = { + "missing template": "Treatment report template not available on device", + "missing input file": "Treatment log file not available on device", + "schema mismatch": "Treatment log file format not recognised", + } + try: hd_serial_number = message.g_config[CONFIG_DEVICE][CONFIG_DEVICE_HD_SERIAL] dg_serial_number = message.g_config[CONFIG_DEVICE][CONFIG_DEVICE_DG_SERIAL] @@ -308,44 +322,54 @@ self.logger.debug('hd: {0},dg: {1},sw: {2}'.format(hd_serial_number, dg_serial_number, sw_version)) treatment_log_json = helpers_read_treatment_log_file(message.parameters[0]) - if treatment_log_json: - treatment_log_json['checksum'] = helpers_sha256_string(json.dumps(treatment_log_json['data'])) - treatment_log_json['serialNumber'] = message.g_config[CONFIG_DEVICE][CONFIG_DEVICE_HD_SERIAL] + treatment_log_json['checksum'] = helpers_sha256_string(json.dumps(treatment_log_json['data'])) + treatment_log_json['serialNumber'] = message.g_config[CONFIG_DEVICE][CONFIG_DEVICE_HD_SERIAL] - g_utils.logger.debug("Treatment log {0}".format(treatment_log_json)) + g_utils.logger.debug("Treatment log {0}".format(treatment_log_json)) - # SEND_TREATMENT_REPORT — treatment-report loss is - # user-visible; surface a CS2UI_ERROR so UI-Brain - # 
can prompt a retry / mark the report as not synced. - outcome, detail = helpers_add_to_network_queue( - network_request_handler=self.network_request_handler, - request_type=NetworkRequestType.CS2DCS_REQ_SEND_TREATMENT_REPORT, - url='', - payload=treatment_log_json, - method='', - g_config=message.g_config, - success_message='CS2DCS_REQ_SEND_TREATMENT_REPORT request added to network' - 'queue', - correlation_id=message.correlation_id) - if outcome != "queued": - self.logger.error( - "CS2DCS_REQ_SEND_TREATMENT_REPORT not queued (%s): %s", - outcome, detail) - err = Error.general(OutboundMessageIDs.CS2UI_ERROR.value, - ErrorIDs.CS_SEND_TREATMENT_REPORT_ERROR.value, - "Queue {0}: {1}".format(outcome, detail)) - self._enqueue_error(err, message.correlation_id) - else: - error = Error.general(OutboundMessageIDs.CS2UI_ERROR.value, - ErrorIDs.CS_SEND_TREATMENT_REPORT_ERROR.value, - f"Failed to read treatment log file: {message.parameters[0]}") - self._enqueue_error(error, message.correlation_id) + # SEND_TREATMENT_REPORT — treatment-report loss is + # user-visible; surface a CS2UI_ERROR so UI-Brain + # can prompt a retry / mark the report as not synced. 
+                outcome, detail = helpers_add_to_network_queue(
+                    network_request_handler=self.network_request_handler,
+                    request_type=NetworkRequestType.CS2DCS_REQ_SEND_TREATMENT_REPORT,
+                    url='',
+                    payload=treatment_log_json,
+                    method='',
+                    g_config=message.g_config,
+                    success_message='CS2DCS_REQ_SEND_TREATMENT_REPORT request added to network '
+                                    'queue',
+                    correlation_id=message.correlation_id)
+                if outcome != "queued":
+                    self.logger.error(
+                        "CS2DCS_REQ_SEND_TREATMENT_REPORT not queued (%s): %s",
+                        outcome, detail)
+                    err = Error.general(OutboundMessageIDs.CS2UI_ERROR.value,
+                                        ErrorIDs.CS_SEND_TREATMENT_REPORT_ERROR.value,
+                                        "Queue {0}: {1}".format(outcome, detail))
+                    self._enqueue_error(err, message.correlation_id)
 
-            except Exception as e:
+            except TreatmentLogParseError as tple:
+                category = tple.args[0] if tple.args else ""
+                msg = _TREATMENT_REPORT_ERROR_MAP.get(
+                    category, "Treatment report could not be prepared")
+                # exc_info keeps the chained parse cause in cloudsync.log.
+                self.logger.error(
+                    "1007 treatment-report parse failure (category=%s): %s",
+                    category, message.parameters[0], exc_info=True,
+                )
                 error = Error.general(OutboundMessageIDs.CS2UI_ERROR.value,
                                       ErrorIDs.CS_SEND_TREATMENT_REPORT_ERROR.value,
-                                      str(e))
+                                      msg)
                 self._enqueue_error(error, message.correlation_id)
+            except Exception:
+                # Full traceback to cloudsync.log only; wire frame
+                # carries a curated comma-free generic string.
+ self.logger.exception("1007 unexpected failure preparing treatment report") + error = Error.general(OutboundMessageIDs.CS2UI_ERROR.value, + ErrorIDs.CS_SEND_TREATMENT_REPORT_ERROR.value, + "Internal error preparing treatment report") + self._enqueue_error(error, message.correlation_id) # DECOMMISSIONING REQUEST elif InboundMessageIDs.mapped_str_value(message.ID) == InboundMessageIDs.UI2CS_REQ_DECOMMISSION and \ @@ -357,10 +381,11 @@ message_body = str( OutboundMessageIDs.CS2UI_REQ_DEVICE_DECOMMISSIONED.value) + ',0' self.output_channel.enqueue_message(message_body) - except Exception as e: + except Exception: + self.logger.exception("UI2CS_REQ_DECOMMISSION unexpected failure") error = Error.general(OutboundMessageIDs.CS2UI_ERROR.value, ErrorIDs.CS_REQ_DECOMMISSION_ERROR.value, - str(e)) + "Internal error during device decommissioning") self._enqueue_error(error, message.correlation_id) # CHECK-IN REQUEST @@ -372,10 +397,11 @@ message_body = str( OutboundMessageIDs.CS2UI_REQ_CHECKIN.value) + ',0' self.output_channel.enqueue_message(message_body) - except Exception as e: + except Exception: + self.logger.exception("UI2CS_SEND_CHECKIN unexpected failure") error = Error.general(OutboundMessageIDs.CS2UI_ERROR.value, ErrorIDs.CS_REQ_CHECKIN_ERROR.value, - str(e)) + "Internal error during check-in") self._enqueue_error(error, message.correlation_id) # CS LOG RETENTION @@ -388,10 +414,11 @@ message_body = str( OutboundMessageIDs.CS2UI_REQ_LOG_RETENTION.value) + f',{num_of_files}' + f',{del_size_mb}' self.output_channel.enqueue_message(message_body) - except Exception as e: + except Exception: + self.logger.exception("UI2CS_REQ_LOG_RETENTION unexpected failure") error = Error.general(OutboundMessageIDs.CS2UI_ERROR.value, ErrorIDs.CS_LOG_RETENTION_ERROR.value, - str(e)) + "Internal error applying log retention") self._enqueue_error(error, message.correlation_id) else: error = Error.general(OutboundMessageIDs.CS2UI_ERROR.value, Index: cloudsync/utils/globals.py 
=================================================================== diff -u -r79dadd7422855193f345beced530c6b92fc49b3a -r8dda975c3b3303764304ed174177c22bbf0f6367 --- cloudsync/utils/globals.py (.../globals.py) (revision 79dadd7422855193f345beced530c6b92fc49b3a) +++ cloudsync/utils/globals.py (.../globals.py) (revision 8dda975c3b3303764304ed174177c22bbf0f6367) @@ -150,6 +150,21 @@ # LOGS UPLOAD TEMPLATE PATH LOG_UPLOAD_TEMPLATE_PATH = "cloudsync/config/log_upload_template.json" +# DCS SYSTEM TENANT FOR UN-ATTRIBUTED CS-LOG UPLOADS +# +# DCS reserves organizationId=1 as the "system tenant" — a non-customer +# tenant whose view is read by Diality operators (not by customers). +# When a device has no customer-tenant association yet (the DCS +# /api/device/validate response carries associatedOrganizationId=null), +# CS routes the cs-log upload to this tenant instead of skipping the +# upload. The local rotated log therefore always reaches the cloud on +# the same heartbeat tick it was eligible for, regardless of whether +# the device has been bound to a customer tenant. Diagnostic data for +# the un-associated period is visible to Diality operators without +# being co-mingled into whichever customer tenant the device is later +# bound to. 
+CS_LOG_SYSTEM_TENANT_ORG_ID = 1 + # USER_AGENT USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36" CONTENT_TYPE = "application/json" Index: cloudsync/utils/heartbeat.py =================================================================== diff -u -r79dadd7422855193f345beced530c6b92fc49b3a -r8dda975c3b3303764304ed174177c22bbf0f6367 --- cloudsync/utils/heartbeat.py (.../heartbeat.py) (revision 79dadd7422855193f345beced530c6b92fc49b3a) +++ cloudsync/utils/heartbeat.py (.../heartbeat.py) (revision 8dda975c3b3303764304ed174177c22bbf0f6367) @@ -16,7 +16,7 @@ def __init__(self, logger: Logger, network_request_handler, output_channel, idempotent_network_request_handler=None, g_config=None, - cs_log_path=None): + cs_log_path=None, log_handler=None): self.logger = logger self.network_request_handler = network_request_handler self.output_channel = output_channel @@ -25,6 +25,13 @@ self._idempotent_handler = idempotent_network_request_handler self._g_config = g_config self._cs_log_path = cs_log_path or CS_LOG_PATH + # Force-rotate dep. Drives the silent-day rotation path on + # devices where logLevel=ERROR and no errors fire — without + # this, shouldRollover() is only called from inside emit() and + # such devices produce no rotated files at all, so the + # opportunistic-upload pathway above has nothing to send. Null + # is the same inert posture as the upload deps. + self._log_handler = log_handler # Progress marker; updated at the end of every heartbeat cycle. self.last_progress_ts = 0 self.thread = Thread(target=self.heartbeat, daemon=True) @@ -36,6 +43,14 @@ """ while True: if self.send_heartbeat: + # Force-rotate on a silent UTC-day boundary so the + # opportunistic-upload step below has a rotated file to + # send even on devices that aren't emitting (e.g. + # logLevel=ERROR + no errors). Runs before the upload + # step so a rotation fired on this tick can be picked + # up on the same tick. 
+ self._force_rotate_if_due() + # Drop-oldest semantics: recent state is more useful than a # stale queued copy, so a "full" outcome is acceptable # (logged at debug; the next tick will try again). @@ -60,6 +75,21 @@ self.last_progress_ts = monotonic() sleep(self.HEARTBEAT_FREQ) + def _force_rotate_if_due(self): + """Invoke the configured log handler's force-rotate path. Null-safe. + + The handler swallows its own errors per its contract; this + wrapper is itself defensive so a missing or broken handler can + never break the heartbeat tick. + """ + if self._log_handler is None: + return + try: + self._log_handler.force_rotate_if_date_changed() + except Exception as exc: + self.logger.warning( + "Heartbeat force-rotate hook failed: %s", exc) + def _maybe_upload_pending_cs_log(self): """Enqueue the oldest rotated CS log file when conditions allow. Index: cloudsync/utils/helpers.py =================================================================== diff -u -r79dadd7422855193f345beced530c6b92fc49b3a -r8dda975c3b3303764304ed174177c22bbf0f6367 --- cloudsync/utils/helpers.py (.../helpers.py) (revision 79dadd7422855193f345beced530c6b92fc49b3a) +++ cloudsync/utils/helpers.py (.../helpers.py) (revision 8dda975c3b3303764304ed174177c22bbf0f6367) @@ -39,6 +39,22 @@ g_utils = GUtils() + +class TreatmentLogParseError(Exception): + """Raised by helpers_read_treatment_log_file for labelled failure modes. + + The first positional arg is the category, used by the 1007 handler to + select a curated, comma-free, customer-readable wire-frame string. + Internal cause (if any) is chained via `raise ... from e` and reaches + cloudsync.log via the handler's `log.exception(...)` — never the wire. 
+ + Categories: + - "missing template" — treatment_report_template.json absent + - "missing input file" — caller-supplied .txr path absent + - "schema mismatch" — parse-loop failure on present input + """ + + CRC_LIST = [ 0, 49, 98, 83, 196, 245, 166, 151, 185, 136, 219, 234, 125, 76, 31, 46, 67, 114, 33, 16, 135, 182, 229, 212, 250, 203, 152, 169, 62, 15, 92, 109, @@ -106,8 +122,12 @@ g_utils.logger.warning(f"Unexpected conversion failure for: '{val}' — replacing with 0") return 0 if math.isinf(f) or math.isnan(f): - g_utils.logger.warning(f"Non-finite float value encountered: '{val}' — replacing with 0") - return 0 + # DCS recognizes -0.0001 as the non-finite sentinel across clinical + # telemetry fields (all physically non-negative on-device, so any + # negative value is unambiguously a sentinel). Single value covers + # -inf / +inf / NaN. + g_utils.logger.warning(f"Non-finite float value encountered: '{val}' — replacing with -0.0001") + return -0.0001 return f else: return val @@ -327,6 +347,15 @@ def helpers_read_treatment_log_file(path: str): + # Preflight: both the template and the caller-supplied .txr must exist + # before we touch the parser. Without this, missing inputs produce a + # KeyError / IndexError / TypeError from deep in the parse loop that + # the 1007 handler cannot translate into a customer-readable error. 
+ if not os.path.exists(TREATMENT_REPORT_TEMPLATE_PATH): + raise TreatmentLogParseError("missing template") + if not os.path.exists(path): + raise TreatmentLogParseError("missing input file") + treatment_data = helpers_read_treatment_report_template(TREATMENT_REPORT_TEMPLATE_PATH) try: @@ -488,11 +517,15 @@ return treatment_data except IOError as er: - g_utils.logger.error('Opening treatment log file error: {0}'.format(' '.join(er.args))) - return None - except Exception as e: - g_utils.logger.error('Error parsing treatment file: {0}'.format(' '.join(e.args))) - return None + # File present at preflight but unreadable at open (permissions, + # mid-read deletion, etc.). Map to the same wire-frame category + # the caller dispatches for missing-input. + raise TreatmentLogParseError("missing input file") from er + except (KeyError, IndexError, TypeError, ValueError) as e: + # Parse loop encountered structurally-invalid content. The 1007 + # handler maps "schema mismatch" to a customer-readable string; + # full Python traceback reaches cloudsync.log via log.exception. + raise TreatmentLogParseError("schema mismatch") from e # Three-state backpressure. Index: cloudsync/utils/watchdog.py =================================================================== diff -u -r79dadd7422855193f345beced530c6b92fc49b3a -r8dda975c3b3303764304ed174177c22bbf0f6367 --- cloudsync/utils/watchdog.py (.../watchdog.py) (revision 79dadd7422855193f345beced530c6b92fc49b3a) +++ cloudsync/utils/watchdog.py (.../watchdog.py) (revision 8dda975c3b3303764304ed174177c22bbf0f6367) @@ -30,8 +30,13 @@ from time import monotonic, sleep -# Default sentinel file path — cs.py monitors this to trigger process restart -SENTINEL_PATH = "/media/sd-card/cloudsync/cloudsync_restart_sentinel" +# Default sentinel file path — cs.py monitors this to trigger process restart. 
+# Placed in a dedicated sentinel/ subfolder under the SD card root so the +# inotify watch on /media/sd-card/cloudsync (non-recursive, set up by +# FileInputBus for UI->CS bus files) does not fire on sentinel writes. The +# subfolder is auto-created at Watchdog __init__ time; isolated from the +# log/ subfolder so log-retention sweeps cannot reach it. +SENTINEL_PATH = "/media/sd-card/cloudsync/sentinel/cloudsync_restart_sentinel" class Watchdog: @@ -55,6 +60,22 @@ self._stop_event = Event() self._thread = Thread(target=self._monitor, daemon=True) + # Ensure the sentinel directory exists at construction time. The + # dedicated subfolder is isolated from FileInputBus's non-recursive + # inotify watch on the channels root and from log-retention sweeps + # of the log/ subfolder; first-run install / upgrade from a build + # whose layout did not include this folder is handled here so the + # release artifact does not need to ship the empty directory. + sentinel_dir = os.path.dirname(self._sentinel_path) + if sentinel_dir: + try: + os.makedirs(sentinel_dir, exist_ok=True) + except OSError as exc: + self._logger.warning( + "Watchdog: could not pre-create sentinel directory %s: %s", + sentinel_dir, exc, + ) + # ------------------------------------------------------------------ # Public API # ------------------------------------------------------------------ Index: cs.py =================================================================== diff -u -radec5be657a174b63971987fcabd492e58720712 -r8dda975c3b3303764304ed174177c22bbf0f6367 --- cs.py (.../cs.py) (revision adec5be657a174b63971987fcabd492e58720712) +++ cs.py (.../cs.py) (revision 8dda975c3b3303764304ed174177c22bbf0f6367) @@ -14,7 +14,7 @@ logger = logging.getLogger(__name__) DELAY = 0.5 -SENTINEL_FILE = "/media/sd-card/cloudsync/cloudsync_restart_sentinel" +SENTINEL_FILE = "/media/sd-card/cloudsync/sentinel/cloudsync_restart_sentinel" SENTINEL_CHECK_INTERVAL = 5 USAGE_FORMAT = "Usage: ./cs.py [debug|info|warning|error] 
[upgrade|update|]"