From ff8dc4d2385cabd052b56b7091bf2801e2e3a942 Mon Sep 17 00:00:00 2001 From: Ethan Byrd Date: Fri, 24 Apr 2026 18:10:58 -0700 Subject: [PATCH 1/2] Harden SIGHUP reload path - Move signal handler to flag-only; defer reload work to a daemon thread so file I/O and logger calls don't run on the signal stack (avoids potential deadlock with the Loki/file logging handler locks). - When a SIGHUP reload trips the mode-switch or missing-secret guard, preserve the live webhook fields but still apply the other reloadable values. Previous behaviour silently dropped unrelated edits made in the same file. - Replace int(...) on storage_upload_threshold with _safe_int helper that warns and falls back to default instead of crashing startup. - Warn at config-file load time if the file is group/world-readable, since it may contain webhook_secret and storage_key. - Tests: nine new cases covering the handler/worker split, the preserve-webhook reload path, _safe_int, and file-mode warnings. --- milter/primitivemail_milter.py | 139 +++++++++++++++++++++++++++------ test_milter.py | 106 ++++++++++++++++++++----- 2 files changed, 201 insertions(+), 44 deletions(-) diff --git a/milter/primitivemail_milter.py b/milter/primitivemail_milter.py index 41cce84..eaba870 100644 --- a/milter/primitivemail_milter.py +++ b/milter/primitivemail_milter.py @@ -20,6 +20,7 @@ import base64 import logging import signal +import threading import time import hashlib import ipaddress @@ -312,21 +313,48 @@ def recipient_filtering_enabled(self): _rcfg = ReloadableConfig() +def _warn_if_world_readable(path: str) -> None: + """Warn when the config file is group/world-readable. + + The file may hold webhook_secret and storage_key, so 0600 is the + expected mode. Non-fatal: we just log. + """ + try: + mode = os.stat(path).st_mode & 0o777 + except OSError: + return + if mode & 0o077: + logger.warning( + f"Config file {path} is group/world-readable (mode {mode:o}); " + f"recommend `chmod 0600` since it may contain webhook_secret/storage_key" + ) + + def _read_config_file(path: str) -> dict: """Read a JSON config file. Returns {} if the file does not exist or is not valid JSON (with a warning).""" try: with open(path) as f: data = json.load(f) - if not isinstance(data, dict): - logger.warning(f"Config file {path} is not a JSON object, ignoring") - return {} - return data except FileNotFoundError: return {} except (json.JSONDecodeError, ValueError, OSError) as e: logger.warning(f"Failed to read config file {path}: {e}") return {} + if not isinstance(data, dict): + logger.warning(f"Config file {path} is not a JSON object, ignoring") + return {} + _warn_if_world_readable(path) + return data + + +def _safe_int(value, default: int, name: str) -> int: + """Coerce value to int; on failure log and return default.""" + try: + return int(value) + except (TypeError, ValueError): + logger.warning(f"Invalid {name} value {value!r}; using default {default}") + return default def _cfg(file_data: dict, key: str, env_var: str, default=None): @@ -429,22 +457,47 @@ def _apply_config(file_data: dict, reloadable_only: bool = False): new_cfg = _build_reloadable_config(file_data) if reloadable_only: - # Guard: don't allow reload to flip between standalone and webhook mode. + # If webhook fields can't be safely applied, preserve them from the + # current config but still apply the other reloadable values. The + # previous "reject the whole reload" behaviour silently dropped + # unrelated edits (e.g. allowed_recipients) made in the same file. was_webhook = not STANDALONE_MODE would_be_webhook = bool(new_cfg.webhook_url) + preserve_webhook = False + if was_webhook != would_be_webhook: - logger.error(f"Config reload rejected: cannot switch modes via SIGHUP " - f"(current: {'webhook' if was_webhook else 'standalone'}, " - f"new config would be: {'webhook' if would_be_webhook else 'standalone'}). " - f"Restart the milter to change modes.") - return - - # Guard: webhook mode requires a secret - if would_be_webhook and not new_cfg.webhook_secret: - logger.error("Config reload rejected: webhook_url is set but webhook_secret is missing") - return - - # Atomic swap — one pointer assignment, GIL-atomic. Handler threads + logger.error( + f"Cannot switch modes via SIGHUP " + f"(current: {'webhook' if was_webhook else 'standalone'}, " + f"new config would be: {'webhook' if would_be_webhook else 'standalone'}). " + f"Webhook fields preserved; other reloadable values applied. " + f"Restart the milter to change modes." + ) + preserve_webhook = True + elif would_be_webhook and not new_cfg.webhook_secret: + logger.error( + "webhook_url is set but webhook_secret is missing. " + "Webhook fields preserved; other reloadable values applied." + ) + preserve_webhook = True + + if preserve_webhook: + old = _rcfg + new_cfg = ReloadableConfig( + webhook_url=old.webhook_url, + webhook_secret=old.webhook_secret, + webhook_extra_headers=old.webhook_extra_headers, + storage_url=new_cfg.storage_url, + storage_key=new_cfg.storage_key, + storage_auth_style=new_cfg.storage_auth_style, + allowed_sender_domains=new_cfg.allowed_sender_domains, + allowed_senders=new_cfg.allowed_senders, + allow_bounces=new_cfg.allow_bounces, + allowed_recipients=new_cfg.allowed_recipients, + spoof_protection=new_cfg.spoof_protection, + ) + + # Atomic swap, one pointer assignment, GIL-atomic. Handler threads # that already hold a reference to the old _rcfg continue using it # for the current message; new messages will pick up this new object. _rcfg = new_cfg @@ -455,8 +508,9 @@ def _apply_config(file_data: dict, reloadable_only: bool = False): # --- Non-reloadable values (startup only) --- MESSAGE_ID_DOMAIN = _cfg(file_data, 'mydomain', 'MYDOMAIN', 'primitivemail') MAIL_DIR = _cfg(file_data, 'mail_dir', 'MAIL_DIR', '/mail/incoming') - STORAGE_UPLOAD_THRESHOLD = int(_cfg(file_data, 'storage_upload_threshold', - 'STORAGE_UPLOAD_THRESHOLD', '3000000')) + STORAGE_UPLOAD_THRESHOLD = _safe_int( + _cfg(file_data, 'storage_upload_threshold', 'STORAGE_UPLOAD_THRESHOLD', '3000000'), + 3_000_000, 'storage_upload_threshold') STANDALONE_MODE = not _rcfg.webhook_url @@ -489,18 +543,50 @@ def _log_config_summary(): logger.info(f"Spamhaus DNSBL enabled: {SPAMHAUS_DNSBL_DOMAIN} (enforcing {SPAMHAUS_DROP_CODE})") +# Signalled by the SIGHUP handler; the reloader thread waits on this and +# performs the actual reload work off the signal-handler stack. +_reload_event = threading.Event() + + +def _sighup_handler(signum=None, frame=None): + """SIGHUP handler. Sets the reload event and returns immediately. + + Reason: signal handlers run on the main thread between bytecodes. + Doing file I/O, JSON parsing, or `logger.info` from inside a handler + can stall (or, with handlers that take locks like the Loki/file + logging handlers, deadlock) when a worker thread is mid-emit on the + same handler. Keep this minimal; let `_reload_worker` do the work. + """ + _reload_event.set() + + +def _reload_worker(): + """Background thread that performs config reloads when SIGHUP fires.""" + while True: + _reload_event.wait() + _reload_event.clear() + try: + reload_config() + except Exception as e: + logger.error(f"Config reload failed: {e}") + + def reload_config(signum=None, frame=None): """Reload hot-reloadable config values from the config file. - Called on SIGHUP. Re-reads the config file and builds a new - ReloadableConfig, then swaps it atomically. Non-reloadable values - (mydomain, mail_dir, etc.) are left unchanged. + Re-reads the config file and builds a new ReloadableConfig, then + swaps it atomically. Non-reloadable values (mydomain, mail_dir, + etc.) are left unchanged. If the config file was present at startup but is now missing or unreadable, the reload is aborted to prevent accidentally clearing file-only settings. + + Safe to call directly (e.g. from tests). In production it runs on + the reloader thread, signalled by `_sighup_handler`; the + `signum`/`frame` kwargs are accepted for legacy callers. """ - logger.info("SIGHUP received - reloading configuration") + logger.info("Reloading configuration") file_data = _read_config_file(CONFIG_FILE_PATH) if not file_data and _initial_file_data: logger.warning("Config file is missing or unreadable; " @@ -1670,8 +1756,11 @@ def main(): """Start the milter""" global METRICS_ENABLED - # Register SIGHUP handler for config reload - signal.signal(signal.SIGHUP, reload_config) + # Register SIGHUP handler for config reload. The handler itself only + # sets an event; the reloader thread does the actual work so file I/O + # and logging don't run on the signal-handler stack. + signal.signal(signal.SIGHUP, _sighup_handler) + threading.Thread(target=_reload_worker, daemon=True, name='config-reloader').start() # Socket configuration # Use TCP socket for easier Docker networking diff --git a/test_milter.py b/test_milter.py index b090317..6680442 100644 --- a/test_milter.py +++ b/test_milter.py @@ -10,6 +10,7 @@ import json import os import sys +import threading from unittest.mock import patch, MagicMock, PropertyMock from urllib.error import HTTPError, URLError from io import BytesIO @@ -2176,8 +2177,8 @@ def test_reload_survives_corrupt_config_file(self, tmp_path): pm.CONFIG_FILE_PATH = original_path self._restore_state(saved) - def test_reload_rejects_mode_switch_webhook_to_standalone(self, tmp_path): - """Cannot switch from webhook to standalone mode via SIGHUP.""" + def test_reload_preserves_webhook_on_mode_switch_attempt(self, tmp_path): + """Mode switch via SIGHUP preserves webhook fields, applies the rest.""" saved = self._save_state() original_path = pm.CONFIG_FILE_PATH try: @@ -2189,22 +2190,28 @@ def test_reload_rejects_mode_switch_webhook_to_standalone(self, tmp_path): pm.CONFIG_FILE_PATH = str(cfg_path) pm._apply_config(pm._read_config_file(str(cfg_path)), reloadable_only=False) assert pm.STANDALONE_MODE is False - assert pm._rcfg.webhook_url == "https://webhook-mode.com" - # Try to remove webhook_url via reload — should be rejected - cfg_path.write_text(json.dumps({})) + # File now drops webhook_url (mode switch) but adds filtering. + cfg_path.write_text(json.dumps({ + "allowed_recipients": ["inbox@myco.com"], + })) with patch.dict(os.environ, {k: v for k, v in os.environ.items() - if k != 'WEBHOOK_URL'}, clear=True): + if k not in ('WEBHOOK_URL', 'WEBHOOK_SECRET', + 'ALLOWED_RECIPIENTS')}, clear=True): pm.reload_config() - # Should still be the original URL + # Webhook fields preserved. assert pm._rcfg.webhook_url == "https://webhook-mode.com" + assert pm._rcfg.webhook_secret == "secret" + # Non-webhook reloadable values still applied. + assert pm._rcfg.allowed_recipients == {"inbox@myco.com"} + assert pm._rcfg.recipient_filtering_enabled is True finally: pm.CONFIG_FILE_PATH = original_path self._restore_state(saved) - def test_reload_rejects_missing_secret(self, tmp_path): - """Cannot reload with webhook_url but no webhook_secret.""" + def test_reload_preserves_webhook_on_missing_secret(self, tmp_path): + """Reload with webhook_url but no secret preserves webhook, applies the rest.""" saved = self._save_state() original_path = pm.CONFIG_FILE_PATH try: @@ -2216,27 +2223,30 @@ def test_reload_rejects_missing_secret(self, tmp_path): pm.CONFIG_FILE_PATH = str(cfg_path) pm._apply_config(pm._read_config_file(str(cfg_path)), reloadable_only=False) - # Reload with URL but no secret — should be rejected + # Bad webhook (no secret) plus a valid filtering edit in the same file. cfg_path.write_text(json.dumps({ "webhook_url": "https://new-url.com", + "allowed_sender_domains": ["trusted.com"], })) with patch.dict(os.environ, {k: v for k, v in os.environ.items() - if k != 'WEBHOOK_SECRET'}, clear=True): + if k not in ('WEBHOOK_SECRET', + 'ALLOWED_SENDER_DOMAINS')}, clear=True): pm.reload_config() - # Should still be the original values + # Webhook preserved. assert pm._rcfg.webhook_url == "https://original.com" assert pm._rcfg.webhook_secret == "original-secret" + # Non-webhook edit still applied. + assert pm._rcfg.allowed_sender_domains == {"trusted.com"} finally: pm.CONFIG_FILE_PATH = original_path self._restore_state(saved) - def test_reload_callable_as_signal_handler(self, tmp_path): - """reload_config accepts signum and frame args (signal handler signature).""" + def test_reload_callable_directly(self, tmp_path): + """reload_config is callable directly and accepts the legacy signal kwargs.""" saved = self._save_state() original_path = pm.CONFIG_FILE_PATH try: - # Start in webhook mode so reload doesn't hit mode-switch guard cfg_path = tmp_path / "milter.json" cfg_path.write_text(json.dumps({ "webhook_url": "https://initial.com", @@ -2245,19 +2255,77 @@ def test_reload_callable_as_signal_handler(self, tmp_path): pm.CONFIG_FILE_PATH = str(cfg_path) pm._apply_config(pm._read_config_file(str(cfg_path)), reloadable_only=False) - # Update config and reload with signal handler signature cfg_path.write_text(json.dumps({ - "webhook_url": "https://signal-test.com", - "webhook_secret": "sig-secret", + "webhook_url": "https://reloaded.com", + "webhook_secret": "reloaded-secret", })) pm.reload_config(signum=1, frame=None) - assert pm._rcfg.webhook_url == "https://signal-test.com" + assert pm._rcfg.webhook_url == "https://reloaded.com" finally: pm.CONFIG_FILE_PATH = original_path self._restore_state(saved) +class TestSighupHandler: + """The signal handler itself does no real work; the reloader thread does.""" + + def test_handler_only_sets_event(self): + pm._reload_event.clear() + try: + pm._sighup_handler(signum=1, frame=None) + assert pm._reload_event.is_set() + finally: + pm._reload_event.clear() + + def test_reload_worker_consumes_event_and_reloads(self, tmp_path): + """The reloader thread waits on the event and runs reload_config.""" + fake_done = threading.Event() + original_reload = pm.reload_config + + def fake_reload(*args, **kwargs): + fake_done.set() + + pm.reload_config = fake_reload + pm._reload_event.clear() + try: + threading.Thread(target=pm._reload_worker, daemon=True).start() + pm._reload_event.set() + assert fake_done.wait(timeout=1.0), "reloader thread did not call reload_config" + finally: + pm.reload_config = original_reload + pm._reload_event.clear() + + +class TestSafeInt: + def test_safe_int_valid(self): + assert pm._safe_int("42", 100, "x") == 42 + assert pm._safe_int(42, 100, "x") == 42 + + def test_safe_int_falls_back_on_garbage(self): + assert pm._safe_int("not-a-number", 100, "x") == 100 + assert pm._safe_int(None, 100, "x") == 100 + assert pm._safe_int([], 100, "x") == 100 + + +class TestConfigFilePerms: + def test_world_readable_config_logs_warning(self, tmp_path, caplog): + cfg_path = tmp_path / "milter.json" + cfg_path.write_text(json.dumps({"webhook_url": "https://x.com"})) + os.chmod(str(cfg_path), 0o644) + with caplog.at_level("WARNING"): + pm._read_config_file(str(cfg_path)) + assert any("group/world-readable" in r.message for r in caplog.records) + + def test_owner_only_config_no_warning(self, tmp_path, caplog): + cfg_path = tmp_path / "milter.json" + cfg_path.write_text(json.dumps({"webhook_url": "https://x.com"})) + os.chmod(str(cfg_path), 0o600) + with caplog.at_level("WARNING"): + pm._read_config_file(str(cfg_path)) + assert not any("group/world-readable" in r.message for r in caplog.records) + + class TestEndToEndConfigFile: """Integration: config file → milter behaviour.""" From 3f2a343b8eceeea4b310bf81dcb7a3151367bf2f Mon Sep 17 00:00:00 2001 From: Ethan Byrd Date: Fri, 24 Apr 2026 18:30:32 -0700 Subject: [PATCH 2/2] Address lint and Greptile review - Drop the redundant inline `import threading` in main(); the module-level import means ruff's F823 was treating the second binding as making threading a function-local before its first use. - Use logger.exception in _reload_worker so failed reloads surface a full traceback instead of just the message. - Extract _run_one_reload so the worker test can drive a single iteration on a thread that exits cleanly, rather than orphaning a daemon thread that might wake up on _reload_event in a later test. --- milter/primitivemail_milter.py | 24 ++++++++++++++++-------- test_milter.py | 8 +++++++- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/milter/primitivemail_milter.py b/milter/primitivemail_milter.py index eaba870..0bd0d4a 100644 --- a/milter/primitivemail_milter.py +++ b/milter/primitivemail_milter.py @@ -560,15 +560,25 @@ def _sighup_handler(signum=None, frame=None): _reload_event.set() +def _run_one_reload(): + """Wait for the next reload signal and perform one reload. + + Extracted from the worker loop so tests can drive a single iteration + on a thread that then exits cleanly, avoiding orphan threads that + might wake up on later test events. + """ + _reload_event.wait() + _reload_event.clear() + try: + reload_config() + except Exception: + logger.exception("Config reload failed") + + def _reload_worker(): """Background thread that performs config reloads when SIGHUP fires.""" while True: - _reload_event.wait() - _reload_event.clear() - try: - reload_config() - except Exception as e: - logger.error(f"Config reload failed: {e}") + _run_one_reload() def reload_config(signum=None, frame=None): @@ -1801,8 +1811,6 @@ def main(): # Start background thread to poll active smtpd process count if METRICS_ENABLED: - import threading - def _poll_smtpd_count(): import subprocess _logged_failure = False diff --git a/test_milter.py b/test_milter.py index 6680442..6008456 100644 --- a/test_milter.py +++ b/test_milter.py @@ -2289,9 +2289,15 @@ def fake_reload(*args, **kwargs): pm.reload_config = fake_reload pm._reload_event.clear() try: - threading.Thread(target=pm._reload_worker, daemon=True).start() + # Run a single iteration so the worker thread exits cleanly + # after the test instead of orphaning a daemon that could + # wake on _reload_event in a later test. + t = threading.Thread(target=pm._run_one_reload, daemon=True) + t.start() pm._reload_event.set() assert fake_done.wait(timeout=1.0), "reloader thread did not call reload_config" + t.join(timeout=1.0) + assert not t.is_alive(), "reload iteration did not exit" finally: pm.reload_config = original_reload pm._reload_event.clear()