From d9291028a8b4376cf98c6d55287ab83274962231 Mon Sep 17 00:00:00 2001 From: Antonio Nuno Monteiro Date: Thu, 2 Apr 2026 22:03:25 -0700 Subject: [PATCH 1/2] test(scheduler): repro SIGCHLD wake-up corruption under high concurrency Signed-off-by: Antonio Nuno Monteiro --- .../dune_scheduler/async_io_tests.ml | 80 ++++++++++ test/expect-tests/dune_scheduler/dune | 13 +- .../dune_scheduler/signal_cleanup_repro.ml | 144 ++++++++++++++++++ .../dune_scheduler/signal_cleanup_repro.mli | 0 4 files changed, 234 insertions(+), 3 deletions(-) create mode 100644 test/expect-tests/dune_scheduler/signal_cleanup_repro.ml create mode 100644 test/expect-tests/dune_scheduler/signal_cleanup_repro.mli diff --git a/test/expect-tests/dune_scheduler/async_io_tests.ml b/test/expect-tests/dune_scheduler/async_io_tests.ml index c908d4e5b97..7549fa445f8 100644 --- a/test/expect-tests/dune_scheduler/async_io_tests.ml +++ b/test/expect-tests/dune_scheduler/async_io_tests.ml @@ -9,6 +9,75 @@ let config = } ;; +let string_contains s needle = + let len_s = String.length s in + let len_needle = String.length needle in + let rec loop i = + i + len_needle <= len_s + && (String.sub s ~pos:i ~len:len_needle = needle || loop (i + 1)) + in + loop 0 +;; + +let print_relevant_stderr stderr = + String.split_lines stderr + |> List.filter ~f:(fun line -> + string_contains line "signal watcher received an unexpected signal") + |> List.iter ~f:print_endline +;; + +let signal_cleanup_repro_prog = + let inline_test_dir = Filename.dirname Sys.executable_name in + let build_dir = Filename.dirname inline_test_dir in + let prog = Filename.concat build_dir "signal_cleanup_repro.exe" in + if Sys.file_exists prog + then prog + else + Code_error.raise + "could not locate signal cleanup repro executable" + [ "cwd", Dyn.string (Sys.getcwd ()) + ; "test_executable", Dyn.string Sys.executable_name + ; "expected", Dyn.string prog + ] +;; + +let signal_cleanup_repro_env () = + let env = + Env.initial + |> Env.add ~var:"DUNE_SIGNAL_REPRO_SHUTDOWN" ~value:"signal" + |> Env.add ~var:"DUNE_SIGNAL_REPRO_ITERS" ~value:"1" + |> Env.add ~var:"DUNE_SIGNAL_REPRO_JOBS" ~value:"128" + |> Env.add ~var:"DUNE_SIGNAL_REPRO_DELAY" ~value:"0.002" + in + Env.to_unix env |> Array.of_list +;; + +let run_signal_cleanup_repro () = + let ic, oc, ec = + Unix.open_process_args_full + signal_cleanup_repro_prog + [| signal_cleanup_repro_prog |] + (signal_cleanup_repro_env ()) + in + close_out oc; + let stdout = In_channel.input_all ic in + let stderr = In_channel.input_all ec in + match Unix.close_process_full (ic, oc, ec) with + | WEXITED 0 -> + let stdout = String.trim stdout in + if String.equal stdout "ok after 1 iterations" + then true + else ( + ignore stdout; + print_relevant_stderr stderr; + false) + | status -> + ignore stdout; + ignore status; + print_relevant_stderr stderr; + false +;; + let%expect_test "read readiness" = (Scheduler.Run.go config ~on_event:(fun _ _ -> ()) @@ fun () -> @@ -91,3 +160,14 @@ let%expect_test "cancel task" = (fun () -> Async_io.Task.cancel task)); [%expect {| successfully cancelled |}] ;; + +let%expect_test "SIGCHLD wakeups survive interrupted high-concurrency builds" = + let rec loop n = + if n = 0 + then print_endline "passed 20 fresh interrupted runs" + else if run_signal_cleanup_repro () + then loop (n - 1) + in + loop 20; + [%expect {| passed 20 fresh interrupted runs |}] +;; diff --git a/test/expect-tests/dune_scheduler/dune b/test/expect-tests/dune_scheduler/dune index 22eedc9e0fe..5fd447f8104 100644 --- a/test/expect-tests/dune_scheduler/dune +++ b/test/expect-tests/dune_scheduler/dune @@ -1,6 +1,8 @@ (library (name dune_scheduler_tests) - (inline_tests) + (modules async_io_tests) + (inline_tests + (deps signal_cleanup_repro.exe)) (preprocess (pps ppx_expect)) (libraries @@ -9,9 +11,14 @@ unix threads.posix fiber - ;; This is because of the (implicit_transitive_deps false) - ;; in dune-project + ; This is because of the (implicit_transitive_deps false) + ; in dune-project ppx_expect.config ppx_expect.config_types base ppx_inline_test.config)) + +(executable + (name signal_cleanup_repro) + (modules signal_cleanup_repro) + (libraries stdune dune_scheduler unix threads.posix fiber)) diff --git a/test/expect-tests/dune_scheduler/signal_cleanup_repro.ml b/test/expect-tests/dune_scheduler/signal_cleanup_repro.ml new file mode 100644 index 00000000000..546589fa44b --- /dev/null +++ b/test/expect-tests/dune_scheduler/signal_cleanup_repro.ml @@ -0,0 +1,144 @@ +open Stdune +module Event = Dune_scheduler__Event +module Process_watcher = Dune_scheduler__Process_watcher +module Signal_watcher = Dune_scheduler__Signal_watcher +module Thread0 = Dune_scheduler__Thread0 + +let debug = + match Sys.getenv_opt "DUNE_SIGNAL_REPRO_DEBUG" with + | None -> fun _ -> () + | Some _ -> prerr_endline +;; + +let env_int name ~default = + match Sys.getenv_opt name with + | None -> default + | Some value -> Int.of_string value |> Option.value_exn +;; + +let env_float name ~default = + match Sys.getenv_opt name with + | None -> default + | Some value -> Float.of_string value |> Option.value_exn +;; + +let sh = + Bin.which "sh" ~path:(Env_path.path Env.initial) + |> Option.map ~f:Path.to_string + |> Option.value_exn +;; + +let spawn_sleep command = + let argv = [| sh; "-c"; command |] in + Unix.create_process sh argv Unix.stdin Unix.stdout Unix.stderr |> Pid.of_int +;; + +let reset_signal_mask () = + let signos = List.map Thread0.interrupt_signals ~f:Signal.to_int in + ignore (Unix.sigprocmask SIG_UNBLOCK signos : int list) +;; + +let jobs_per_iteration = env_int "DUNE_SIGNAL_REPRO_JOBS" ~default:64 +let iterations = env_int "DUNE_SIGNAL_REPRO_ITERS" ~default:100 +let interrupt_after = env_float "DUNE_SIGNAL_REPRO_DELAY" ~default:0.005 +let timeout_after = env_float "DUNE_SIGNAL_REPRO_TIMEOUT" ~default:5.0 +let commands = [| "sleep 0.01"; "sleep 0.02"; "sleep 0.03"; "sleep 0.05" |] + +type shutdown_mode = + | Queue + | Signal + +let shutdown_mode = + match Sys.getenv_opt "DUNE_SIGNAL_REPRO_SHUTDOWN" with + | Some "signal" -> Signal + | Some "queue" | None -> Queue + | Some mode -> + Code_error.raise "unknown signal repro shutdown mode" [ "mode", Dyn.string mode ] +;; + +let cleanup_iteration events process_watcher = + debug "starting iteration"; + let pids = + List.init jobs_per_iteration ~f:(fun i -> + spawn_sleep commands.(i mod Array.length commands)) + in + List.iter pids ~f:(fun pid -> + let job : Event.job = + { pid; is_process_group_leader = false; ivar = Fiber.Ivar.create () } + in + Process_watcher.register_job process_watcher job); + let (_ : Thread.t) = + Thread.create + (fun () -> + Thread.delay interrupt_after; + match shutdown_mode with + | Queue -> Event.Queue.send_shutdown events (Signal Int) + | Signal -> Unix.kill (Unix.getpid ()) Sys.sigint) + () + in + let rec wait_for_shutdown () = + match Event.Queue.next events with + | Shutdown (Signal Int) -> () + | Shutdown _ -> wait_for_shutdown () + | Job_complete_ready -> + ignore (Process_watcher.wait_unix process_watcher : Fiber.fill list); + wait_for_shutdown () + | Fiber_fill_ivar _ + | File_watcher_task _ + | Build_inputs_changed _ + | File_system_sync _ + | File_system_watcher_terminated -> wait_for_shutdown () + in + wait_for_shutdown (); + debug "got shutdown"; + Process_watcher.killall process_watcher Sys.sigkill; + while Event.Queue.pending_jobs events > 0 do + match Event.Queue.next events with + | Shutdown _ -> () + | Job_complete_ready -> + ignore (Process_watcher.wait_unix process_watcher : Fiber.fill list) + | Fiber_fill_ivar _ + | File_watcher_task _ + | Build_inputs_changed _ + | File_system_sync _ + | File_system_watcher_terminated -> () + done +;; + +let () = + Printexc.record_backtrace true; + try + reset_signal_mask (); + let events = Event.Queue.create () in + debug "created queue"; + let signal_watcher = Signal_watcher.init ~print_ctrl_c_warning:false events in + debug "started signal watcher"; + let finished = ref false in + let (_ : Thread.t) = + Thread.create + (fun () -> + Thread.delay timeout_after; + if not !finished + then ( + prerr_endline + "signal watcher cleanup repro timed out while exercising SIGINT"; + exit 2)) + () + in + let process_watcher = Process_watcher.init events in + debug "started process watcher"; + for _ = 1 to iterations do + cleanup_iteration events process_watcher + done; + debug "finished iterations"; + Unix.kill (Unix.getpid ()) (Signal.to_int Thread0.signal_watcher_interrupt); + Thread0.join signal_watcher; + finished := true; + debug "joined signal watcher"; + Printf.printf "ok after %d iterations\n%!" iterations + with + | exn -> + prerr_endline (Printexc.to_string exn); + prerr_string (Printexc.get_backtrace ()); + exit 1 +;; diff --git a/test/expect-tests/dune_scheduler/signal_cleanup_repro.mli b/test/expect-tests/dune_scheduler/signal_cleanup_repro.mli new file mode 100644 index 00000000000..e69de29bb2d From c5c65a864f508e659fa34ff505b3184111c1cd0f Mon Sep 17 00:00:00 2001 From: Antonio Nuno Monteiro Date: Fri, 3 Apr 2026 15:07:39 -0700 Subject: [PATCH 2/2] fix: invalid SIGCHLD during high-concurrency interrupted builds Signed-off-by: Antonio Nuno Monteiro --- otherlibs/stdune/src/signal_stubs.c | 2 +- src/dune_scheduler/dune | 4 +-- src/dune_scheduler/process_watcher.ml | 9 ++++--- src/dune_scheduler/process_watcher.mli | 1 + src/dune_scheduler/scheduler.ml | 1 + src/dune_scheduler/signal_watcher.ml | 16 +++++++++--- src/dune_scheduler/thread0.ml | 7 ++--- src/dune_scheduler/thread0.mli | 4 +-- .../dune_scheduler/async_io_tests.ml | 26 ++++++------------- 9 files changed, 37 insertions(+), 33 deletions(-) diff --git a/otherlibs/stdune/src/signal_stubs.c b/otherlibs/stdune/src/signal_stubs.c index e734b189c50..ccb757d4acf 100644 --- a/otherlibs/stdune/src/signal_stubs.c +++ b/otherlibs/stdune/src/signal_stubs.c @@ -1,8 +1,8 @@ #include #ifdef _WIN32 -#include #else +#include #include #include #endif diff --git a/src/dune_scheduler/dune b/src/dune_scheduler/dune index 63286f5b03a..9d6c1f21cbc 100644 --- a/src/dune_scheduler/dune +++ b/src/dune_scheduler/dune @@ -14,6 +14,6 @@ fiber memo threads.posix - dune_trace - dune_util) + dune_trace + dune_util) (synopsis "Internal Dune library, do not use!")) diff --git a/src/dune_scheduler/process_watcher.ml b/src/dune_scheduler/process_watcher.ml index 43dd701a5c8..3d7a2bed50e 100644 --- a/src/dune_scheduler/process_watcher.ml +++ b/src/dune_scheduler/process_watcher.ml @@ -44,8 +44,6 @@ let dyn_of_process_state = function | Zombie _ -> Dyn.variant "Zombie" [] ;; -(* This mutable table is safe: it does not interact with the state we track in - the build system. *) type t = { mutex : Mutex.t Lazy.t ; something_is_running : Condition.t option @@ -214,7 +212,12 @@ let init events = in if Sys.win32 then ( - let (_ : Thread.t) = Thread0.spawn ~name:"process-watcher" (fun () -> run_win32 t) in + let (_ : Thread.t) = + Thread0.spawn ~name:"process-watcher" (fun () -> + run_win32 t) + in ()); t ;; + +let shutdown (_ : t) = () diff --git a/src/dune_scheduler/process_watcher.mli b/src/dune_scheduler/process_watcher.mli index 2282fe8f6a2..06c957afa91 100644 --- a/src/dune_scheduler/process_watcher.mli +++ b/src/dune_scheduler/process_watcher.mli @@ -17,3 +17,4 @@ val is_running : t -> Pid.t -> bool val killall : t -> int -> unit val wait_unix : t -> Fiber.fill list +val shutdown : t -> unit diff --git a/src/dune_scheduler/scheduler.ml b/src/dune_scheduler/scheduler.ml index a57948b34a3..6ab11f2385a 100644 --- a/src/dune_scheduler/scheduler.ml +++ b/src/dune_scheduler/scheduler.ml @@ -183,6 +183,7 @@ let kill_and_wait_for_all_processes t = then ( Unix.kill (Unix.getpid ()) (Signal.to_int Thread.signal_watcher_interrupt); Thread.join t.signal_watcher); + Process_watcher.shutdown t.process_watcher; !saw_signal ;; diff --git a/src/dune_scheduler/signal_watcher.ml b/src/dune_scheduler/signal_watcher.ml index c4538090fe1..3bafbdfad79 100644 --- a/src/dune_scheduler/signal_watcher.ml +++ b/src/dune_scheduler/signal_watcher.ml @@ -2,6 +2,16 @@ open Stdune let signos = List.map Thread0.interrupt_signals ~f:Signal.to_int +let macos_sigchld_warning = + {| + +***************************************** +* Ignoring invalid macOS SIGCHLD return * +***************************************** + +|} +;; + let warning = {| @@ -55,10 +65,8 @@ let run ~print_ctrl_c_warning q : unit = if n = 2 && print_ctrl_c_warning then prerr_endline warning; if n = 3 then sys_exit 1 | _ -> - (* we only blocked the signals above *) - Code_error.raise - "signal watcher received an unexpected signal" - [ "signal", Signal.to_dyn signal ] + prerr_endline macos_sigchld_warning; + Event.Queue.send_job_completed_ready q done ;; diff --git a/src/dune_scheduler/thread0.ml b/src/dune_scheduler/thread0.ml index eb858f7a914..ebffee05fcb 100644 --- a/src/dune_scheduler/thread0.ml +++ b/src/dune_scheduler/thread0.ml @@ -9,13 +9,14 @@ let signal_watcher_interrupt : Signal.t = Usr1 let signal_watcher_debug : Signal.t = Usr2 (* These are the signals that will make the scheduler attempt to terminate dune - or signal to dune to reap a process *) + or signal to dune to reap a process. +*) let interrupt_signals : Signal.t list = [ signal_watcher_interrupt; signal_watcher_debug; Chld; Int; Quit; Term ] ;; (* In addition, the scheduler also blocks some other signals so that only - designated threads can handle them by unblocking *) + designated threads can handle them by unblocking. *) let blocked_signals : Signal.t list = Terminal_signals.signals @ interrupt_signals let block_signals = @@ -35,7 +36,7 @@ let create = then Thread.create else (* On unix, we make sure to block signals globally before starting a - thread so that only the signal watcher thread can receive signals. *) + thread so that only the signal watcher thread can receive signals. *) fun f x -> Lazy.force block_signals; Thread.create f x diff --git a/src/dune_scheduler/thread0.mli b/src/dune_scheduler/thread0.mli index 932d8c00874..22ca8cd0058 100644 --- a/src/dune_scheduler/thread0.mli +++ b/src/dune_scheduler/thread0.mli @@ -8,8 +8,8 @@ val signal_watcher_interrupt : Signal.t (** Magic signal to make dune debugging info *) val signal_watcher_debug : Signal.t -val join : t -> unit val interrupt_signals : Signal.t list +val join : t -> unit +val wait_signal : int list -> int val spawn : name:string -> (unit -> unit) -> Thread.t val delay : float -> unit -val wait_signal : int list -> int diff --git a/test/expect-tests/dune_scheduler/async_io_tests.ml b/test/expect-tests/dune_scheduler/async_io_tests.ml index 7549fa445f8..f4ef6ec57ab 100644 --- a/test/expect-tests/dune_scheduler/async_io_tests.ml +++ b/test/expect-tests/dune_scheduler/async_io_tests.ml @@ -9,21 +9,13 @@ let config = } ;; -let string_contains s needle = - let len_s = String.length s in - let len_needle = String.length needle in - let rec loop i = - i + len_needle <= len_s - && (String.sub s ~pos:i ~len:len_needle = needle || loop (i + 1)) +let print_relevant_output stdout stderr = + let print_if_nonempty s = + let s = String.trim s in + if not (String.is_empty s) then print_endline s in - loop 0 -;; - -let print_relevant_stderr stderr = - String.split_lines stderr - |> List.filter ~f:(fun line -> - string_contains line "signal watcher received an unexpected signal") - |> List.iter ~f:print_endline + print_if_nonempty stdout; + print_if_nonempty stderr ;; let signal_cleanup_repro_prog = @@ -68,13 +60,11 @@ let run_signal_cleanup_repro () = if String.equal stdout "ok after 1 iterations" then true else ( - ignore stdout; - print_relevant_stderr stderr; + print_relevant_output stdout stderr; false) | status -> - ignore stdout; ignore status; - print_relevant_stderr stderr; + print_relevant_output stdout stderr; false ;;