From 6f58bb17257dc6484da260168e733d5d2b6c1292 Mon Sep 17 00:00:00 2001 From: JakeRadMSFT Date: Sat, 7 Mar 2026 23:43:54 -0800 Subject: [PATCH 1/2] Build coordinator for system-wide MSBuild node management Fix 3 macOS/Unix node reuse bugs: - Node reuse handshake timeout was 0ms (poll-only), increased to 1000ms - getsid() returns per-terminal values on Unix, breaking cross-terminal reuse; use sessionId=0 - ClientConnectTimeout was 60s, reduced to 5s Add hash-based pipe naming on Unix: - Nodes create pipes as MSBuild-{hash}-{PID} for O(1) discovery - Replaces probing all dotnet processes on the system Add build coordinator (dotnet msbuild --coordinator): - System-wide node budget management across concurrent MSBuild instances - Epoch-based heartbeat-gated promotion prevents budget overrun - Fair-share budget with dynamic rebalancing as builds start/finish - PID-aware staleness reaper for crashed builds - Pipe file watchdog for self-healing - ShutdownExcessNodes for mid-build node trimming - Coordinator-aware node lifecycle (no-reuse on shutdown) Results: 10 concurrent builds peak at 11 nodes (was 55+), immediate cleanup. --- .../BuildCoordinator/BuildCoordinator.cs | 688 ++++++++++++++++++ .../BuildCoordinatorClient.cs | 217 ++++++ .../BackEnd/BuildManager/BuildManager.cs | 99 ++- .../Components/Communications/INodeManager.cs | 5 + .../Communications/NodeEndpointOutOfProc.cs | 8 +- .../Components/Communications/NodeManager.cs | 8 + .../Communications/NodeProviderOutOfProc.cs | 20 + .../NodeProviderOutOfProcBase.cs | 44 +- .../Communications/TaskHostNodeManager.cs | 5 + src/Build/Microsoft.Build.csproj | 2 + src/MSBuild/XMake.cs | 67 ++ src/Shared/CommunicationsUtilities.cs | 53 +- src/Shared/NamedPipeUtil.cs | 51 ++ src/Shared/NodeEndpointOutOfProcBase.cs | 4 +- 14 files changed, 1241 insertions(+), 30 deletions(-) create mode 100644 src/Build/BackEnd/BuildCoordinator/BuildCoordinator.cs create mode 100644 src/Build/BackEnd/BuildCoordinator/BuildCoordinatorClient.cs diff --git a/src/Build/BackEnd/BuildCoordinator/BuildCoordinator.cs b/src/Build/BackEnd/BuildCoordinator/BuildCoordinator.cs new file mode 100644 index 00000000000..f5b5e786cae --- /dev/null +++ b/src/Build/BackEnd/BuildCoordinator/BuildCoordinator.cs @@ -0,0 +1,688 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; +using System.IO.Pipes; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; + +namespace Microsoft.Build.BackEnd +{ + /// + /// A standalone coordinator process that manages node budgets across concurrent MSBuild instances. + /// + /// Key design: heartbeat-gated promotion. When a new build registers and there are already + /// active builds, the new build is queued. Active builds learn their reduced budget on their + /// next heartbeat. Only after ALL active builds have acknowledged the reduction (via heartbeat) + /// is the new build promoted. This prevents temporarily exceeding the node budget. + /// + /// Protocol (line-based text over named pipe): + /// REGISTER buildId requestedNodes + /// → OK grantedNodes (first build — immediate) + /// → QUEUED position totalQueued (subsequent builds — wait for heartbeat gate) + /// + /// HEARTBEAT buildId + /// → OK grantedNodes (active build — may have new budget) + /// → QUEUED position totalQueued waitSec (queued build — position update) + /// + /// UNREGISTER buildId + /// → OK [promoted buildId] (promotes next queued build if any) + /// + /// STATUS + /// → Multi-line summary of active + queued builds + /// + /// SHUTDOWN + /// → OK + /// + public sealed class BuildCoordinator : IDisposable + { + /// + /// Well-known pipe name. All MSBuild instances for this user connect here. + /// On Unix: /tmp/MSBuild-Coordinator-{username} + /// On Windows: MSBuild-Coordinator-{username} + /// + internal static string GetPipeName() + { + string user = Environment.UserName; + string pipeName = $"MSBuild-Coordinator-{user}"; + + if (NativeMethodsShared.IsUnixLike) + { + return $"/tmp/{pipeName}"; + } + + return pipeName; + } + + private readonly int _totalBudget; + private readonly int _maxConcurrentBuilds; + private readonly ConcurrentDictionary _activeBuilds = new(); + private readonly List _queuedBuilds = new(); + private readonly object _queueLock = new(); + private readonly CancellationTokenSource _cts = new(); + private Task? _listenTask; + private Timer? _stalenessReaper; + private Timer? _pipeWatchdog; + private CancellationTokenSource? _listenCycleCts; + + /// + /// Epoch counter — bumped whenever the budget landscape changes and active builds + /// need to acknowledge their new budget before queued builds can be promoted. + /// + private int _rebalanceEpoch; + + /// + /// If a build hasn't heartbeated in this many seconds, consider it dead. + /// + private const int StaleHeartbeatSeconds = 10; + + public BuildCoordinator(int totalBudget, int maxConcurrentBuilds) + { + _totalBudget = totalBudget; + _maxConcurrentBuilds = maxConcurrentBuilds; + } + + /// + /// Start listening for MSBuild client connections. + /// + public void Start() + { + string pipeName = GetPipeName(); + + // On Unix, clean up stale pipe file + if (NativeMethodsShared.IsUnixLike && File.Exists(pipeName)) + { + File.Delete(pipeName); + } + + Console.WriteLine($"Build Coordinator starting"); + Console.WriteLine($" Pipe: {pipeName}"); + Console.WriteLine($" Budget: {_totalBudget} nodes"); + Console.WriteLine($" Max concurrent builds: {_maxConcurrentBuilds}"); + + _listenTask = Task.Run(() => ListenLoop(_cts.Token)); + + // Periodically reap builds that stopped heartbeating (crashed/killed process) + _stalenessReaper = new Timer(ReapStaleBuilds, null, TimeSpan.FromSeconds(5), TimeSpan.FromSeconds(5)); + + // Watch for pipe file deletion (e.g. by overzealous cleanup scripts) + if (NativeMethodsShared.IsUnixLike) + { + _pipeWatchdog = new Timer(CheckPipeHealth, null, TimeSpan.FromSeconds(2), TimeSpan.FromSeconds(2)); + } + } + + /// + /// Stop the coordinator and clean up. + /// + public void Stop() + { + _pipeWatchdog?.Dispose(); + _pipeWatchdog = null; + _stalenessReaper?.Dispose(); + _stalenessReaper = null; + _cts.Cancel(); + _listenCycleCts?.Cancel(); + _listenTask?.Wait(TimeSpan.FromSeconds(5)); + } + + public void Dispose() + { + Stop(); + _cts.Dispose(); + } + + /// + /// Block until the coordinator is stopped. + /// + public void WaitForShutdown() + { + try + { + _listenTask?.Wait(); + } + catch (AggregateException ex) when (ex.InnerException is OperationCanceledException) + { + } + } + + private void ListenLoop(CancellationToken ct) + { + while (!ct.IsCancellationRequested) + { + string pipeName = GetPipeName(); + + // Create a per-cycle CTS linked to the main one. + // The pipe watchdog can cancel just this cycle to force socket recreation. + _listenCycleCts = CancellationTokenSource.CreateLinkedTokenSource(ct); + var cycleToken = _listenCycleCts.Token; + + // Don't use 'using' — ownership transfers to the threadpool handler. +#pragma warning disable CA2000 // Dispose is called in the Task.Run finally block + var server = new NamedPipeServerStream( + pipeName, + PipeDirection.InOut, + NamedPipeServerStream.MaxAllowedServerInstances, + PipeTransmissionMode.Byte, + System.IO.Pipes.PipeOptions.CurrentUserOnly); +#pragma warning restore CA2000 + + try + { + server.WaitForConnectionAsync(cycleToken).Wait(cycleToken); + } + catch (OperationCanceledException) when (!ct.IsCancellationRequested) + { + server.Dispose(); + Console.WriteLine($"[{DateTime.Now:HH:mm:ss}] Pipe file missing, recreating listener..."); + continue; + } + catch (OperationCanceledException) + { + server.Dispose(); + break; + } + catch (AggregateException ex) when (ex.InnerException is OperationCanceledException && !ct.IsCancellationRequested) + { + server.Dispose(); + Console.WriteLine($"[{DateTime.Now:HH:mm:ss}] Pipe file missing, recreating listener..."); + continue; + } + catch (AggregateException ex) when (ex.InnerException is OperationCanceledException) + { + server.Dispose(); + break; + } + + // Handle on threadpool — immediately loop back to accept the next client. + // This minimizes the gap where no listener is bound. + var capturedServer = server; + Task.Run(() => + { + try + { + HandleConnection(capturedServer); + } + finally + { + capturedServer.Dispose(); + } + }, ct); + } + } + + /// + /// Periodic watchdog that detects if the coordinator pipe file was deleted + /// (e.g. by a cleanup script) and interrupts the listen loop so it recreates it. + /// + private void CheckPipeHealth(object? state) + { + string pipeName = GetPipeName(); + if (!File.Exists(pipeName)) + { + Console.WriteLine($"[{DateTime.Now:HH:mm:ss}] WARNING: Pipe file {pipeName} was deleted externally! Triggering recreation..."); + _listenCycleCts?.Cancel(); + } + } + + private void HandleConnection(NamedPipeServerStream server) + { + try + { + using var reader = new StreamReader(server, leaveOpen: true); + using var writer = new StreamWriter(server, leaveOpen: true) { AutoFlush = true }; + + string? line = reader.ReadLine(); + if (string.IsNullOrEmpty(line)) + { + return; + } + + string[] parts = line.Split(' '); + string command = parts[0].ToUpperInvariant(); + + switch (command) + { + case "REGISTER": + HandleRegister(parts, writer); + break; + case "HEARTBEAT": + HandleHeartbeat(parts, writer); + break; + case "UNREGISTER": + HandleUnregister(parts, writer); + break; + case "STATUS": + HandleStatus(writer); + break; + case "SHUTDOWN": + writer.WriteLine("OK"); + _cts.Cancel(); + break; + default: + writer.WriteLine("ERR unknown command"); + break; + } + } + catch (IOException) + { + // Client disconnected + } + } + + private void HandleRegister(string[] parts, StreamWriter writer) + { + if (parts.Length < 3) + { + writer.WriteLine("ERR usage: REGISTER buildId requestedNodes"); + return; + } + + string buildId = parts[1]; + if (!int.TryParse(parts[2], out int requested) || requested <= 0) + { + writer.WriteLine("ERR invalid requestedNodes"); + return; + } + + var registration = new BuildRegistration(buildId, requested, DateTime.UtcNow); + + // First build ever — activate immediately with full budget + if (_activeBuilds.IsEmpty) + { + _activeBuilds[buildId] = registration; + registration.AcknowledgedEpoch = _rebalanceEpoch; + int granted = CalculateBudget(buildId); + registration.GrantedNodes = granted; + + Console.WriteLine($"[{DateTime.Now:HH:mm:ss}] REGISTER {buildId}: requested={requested} granted={granted} (first build)"); + writer.WriteLine($"OK {granted}"); + return; + } + + // Subsequent builds — always queue. Bump epoch so active builds must + // heartbeat (acknowledge reduced budget) before this build is promoted. + lock (_queueLock) + { + _queuedBuilds.Add(registration); + _rebalanceEpoch++; + int position = _queuedBuilds.Count; + int totalQueued = _queuedBuilds.Count; + + Console.WriteLine($"[{DateTime.Now:HH:mm:ss}] QUEUED {buildId}: position={position}/{totalQueued} active={_activeBuilds.Count} epoch={_rebalanceEpoch} (waiting for heartbeat gate)"); + writer.WriteLine($"QUEUED {position} {totalQueued}"); + } + } + + private void HandleHeartbeat(string[] parts, StreamWriter writer) + { + if (parts.Length < 2) + { + writer.WriteLine("ERR usage: HEARTBEAT buildId"); + return; + } + + string buildId = parts[1]; + + // Check if build is active + if (_activeBuilds.TryGetValue(buildId, out var activeReg)) + { + activeReg.LastHeartbeat = DateTime.UtcNow; + activeReg.AcknowledgedEpoch = _rebalanceEpoch; + + int newBudget = CalculateBudget(buildId); + activeReg.GrantedNodes = newBudget; + writer.WriteLine($"OK {newBudget}"); + + // After acknowledging, check if all active builds are caught up + // and we can promote queued builds + TryPromotePending(); + return; + } + + // Check if build is queued + lock (_queueLock) + { + int index = _queuedBuilds.FindIndex(b => b.BuildId == buildId); + if (index >= 0) + { + var queuedReg = _queuedBuilds[index]; + queuedReg.LastHeartbeat = DateTime.UtcNow; + int position = index + 1; + int totalQueued = _queuedBuilds.Count; + int waitSec = (int)(DateTime.UtcNow - queuedReg.QueuedAt).TotalSeconds; + writer.WriteLine($"QUEUED {position} {totalQueued} {waitSec}"); + return; + } + } + + // Unknown build — return full budget (fallback) + writer.WriteLine($"OK {_totalBudget}"); + } + + private void HandleUnregister(string[] parts, StreamWriter writer) + { + if (parts.Length < 2) + { + writer.WriteLine("ERR usage: UNREGISTER buildId"); + return; + } + + string buildId = parts[1]; + + // Remove from active builds + bool wasActive = _activeBuilds.TryRemove(buildId, out _); + + // Also remove from queue in case it was queued + if (!wasActive) + { + lock (_queueLock) + { + _queuedBuilds.RemoveAll(b => b.BuildId == buildId); + } + } + + Console.WriteLine($"[{DateTime.Now:HH:mm:ss}] UNREGISTER {buildId}: active={_activeBuilds.Count} queued={_queuedBuilds.Count}"); + + // When a build leaves, remaining builds get MORE budget (safe direction). + // Promote immediately if there's a slot — the promoted build gets the correct + // share, and existing builds will learn their increased budget on next heartbeat. + string? promoted = null; + if (wasActive && _activeBuilds.Count < _maxConcurrentBuilds) + { + lock (_queueLock) + { + if (_queuedBuilds.Count > 0) + { + var next = _queuedBuilds[0]; + _queuedBuilds.RemoveAt(0); + next.PromotedAt = DateTime.UtcNow; + next.AcknowledgedEpoch = _rebalanceEpoch; + _activeBuilds[next.BuildId] = next; + int granted = CalculateBudget(next.BuildId); + next.GrantedNodes = granted; + promoted = next.BuildId; + + Console.WriteLine($"[{DateTime.Now:HH:mm:ss}] PROMOTED {next.BuildId}: granted={granted} waited={(next.PromotedAt.Value - next.QueuedAt):mm\\:ss} active={_activeBuilds.Count} queued={_queuedBuilds.Count}"); + + // If more queued, bump epoch for next round + if (_queuedBuilds.Count > 0) + { + _rebalanceEpoch++; + } + } + } + } + + if (promoted != null) + { + writer.WriteLine($"OK promoted {promoted}"); + } + else + { + writer.WriteLine("OK"); + } + + RebalanceAll(); + } + + /// + /// Promote queued builds if: + /// 1. There's capacity (active < max concurrent) + /// 2. All active builds have acknowledged the current rebalance epoch + /// (so they've received their reduced budget via heartbeat) + /// Promotes one build at a time, bumping epoch after each so the newly + /// promoted build must also heartbeat before the next one is promoted. + /// + private void TryPromotePending() + { + if (_activeBuilds.Count >= _maxConcurrentBuilds) + { + return; + } + + // Check that ALL active builds have acknowledged the current epoch + int currentEpoch = _rebalanceEpoch; + foreach (var kvp in _activeBuilds) + { + if (kvp.Value.AcknowledgedEpoch < currentEpoch) + { + return; // Not all caught up yet + } + } + + lock (_queueLock) + { + if (_queuedBuilds.Count == 0) + { + return; + } + + // Promote one build + var next = _queuedBuilds[0]; + _queuedBuilds.RemoveAt(0); + + next.PromotedAt = DateTime.UtcNow; + next.AcknowledgedEpoch = currentEpoch; // It starts caught up + _activeBuilds[next.BuildId] = next; + + int granted = CalculateBudget(next.BuildId); + next.GrantedNodes = granted; + + Console.WriteLine($"[{DateTime.Now:HH:mm:ss}] PROMOTED {next.BuildId}: granted={granted} waited={(next.PromotedAt.Value - next.QueuedAt):mm\\:ss} active={_activeBuilds.Count} queued={_queuedBuilds.Count}"); + + // If more queued, bump epoch — existing active builds must heartbeat + // their new (further-reduced) budget before the next promotion + if (_queuedBuilds.Count > 0) + { + _rebalanceEpoch++; + Console.WriteLine($"[{DateTime.Now:HH:mm:ss}] Epoch bumped to {_rebalanceEpoch} — {_queuedBuilds.Count} still queued"); + } + } + + RebalanceAll(); + } + + private void HandleStatus(StreamWriter writer) + { + int queueCount; + lock (_queueLock) { queueCount = _queuedBuilds.Count; } + + writer.WriteLine($"OK budget={_totalBudget} active={_activeBuilds.Count} queued={queueCount} max={_maxConcurrentBuilds} epoch={_rebalanceEpoch}"); + + if (!_activeBuilds.IsEmpty) + { + writer.WriteLine("Active:"); + foreach (var kvp in _activeBuilds) + { + var reg = kvp.Value; + string ack = reg.AcknowledgedEpoch >= _rebalanceEpoch ? "yes" : "no"; + writer.WriteLine($" {reg.BuildId}: granted={reg.GrantedNodes} requested={reg.RequestedNodes} epoch_ack={ack} age={DateTime.UtcNow - reg.RegisteredAt:mm\\:ss}"); + } + } + + lock (_queueLock) + { + if (_queuedBuilds.Count > 0) + { + writer.WriteLine("Queued:"); + for (int i = 0; i < _queuedBuilds.Count; i++) + { + var reg = _queuedBuilds[i]; + int waitSec = (int)(DateTime.UtcNow - reg.QueuedAt).TotalSeconds; + writer.WriteLine($" #{i + 1} {reg.BuildId}: requested={reg.RequestedNodes} waiting={waitSec}s"); + } + } + } + } + + private int CalculateBudget(string buildId) + { + int activeCount = _activeBuilds.Count; + if (activeCount == 0) + { + return _totalBudget; + } + + // Account for queued builds that will be promoted soon. + // This way active builds pre-shrink to make room. + int pendingCount; + lock (_queueLock) + { + pendingCount = Math.Min(_queuedBuilds.Count, _maxConcurrentBuilds - activeCount); + pendingCount = Math.Max(0, pendingCount); + } + + int totalBuilds = activeCount + pendingCount; + int fairShare = Math.Max(1, _totalBudget / totalBuilds); + + // But don't exceed what the build originally requested + if (_activeBuilds.TryGetValue(buildId, out var registration)) + { + return Math.Min(fairShare, registration.RequestedNodes); + } + + return fairShare; + } + + private void RebalanceAll() + { + foreach (var kvp in _activeBuilds) + { + kvp.Value.GrantedNodes = CalculateBudget(kvp.Key); + } + + if (!_activeBuilds.IsEmpty) + { + var summary = string.Join(", ", _activeBuilds.Select(b => $"{b.Key}={b.Value.GrantedNodes}")); + Console.WriteLine($"[{DateTime.Now:HH:mm:ss}] Rebalanced: {summary}"); + } + } + + /// + /// Periodic timer callback that removes builds whose process has exited. + /// Only removes if heartbeat is stale AND the PID is no longer running. + /// + private void ReapStaleBuilds(object? state) + { + var now = DateTime.UtcNow; + bool anyReaped = false; + + // Check active builds + foreach (var kvp in _activeBuilds) + { + var reg = kvp.Value; + double staleSec = (now - reg.LastHeartbeat).TotalSeconds; + + if (staleSec < StaleHeartbeatSeconds) + { + continue; // Recent heartbeat, still healthy + } + + // Heartbeat is stale — check if the process is actually dead + if (IsProcessAlive(kvp.Key)) + { + continue; // Process still running, just slow to heartbeat + } + + // Process is dead — reap it + if (_activeBuilds.TryRemove(kvp.Key, out _)) + { + Console.WriteLine($"[{DateTime.Now:HH:mm:ss}] REAPED {kvp.Key}: process dead, stale {staleSec:F0}s"); + anyReaped = true; + } + } + + // Check queued builds too + lock (_queueLock) + { + for (int i = _queuedBuilds.Count - 1; i >= 0; i--) + { + var reg = _queuedBuilds[i]; + double staleSec = (now - reg.LastHeartbeat).TotalSeconds; + + if (staleSec < StaleHeartbeatSeconds) + { + continue; + } + + if (IsProcessAlive(reg.BuildId)) + { + continue; + } + + Console.WriteLine($"[{DateTime.Now:HH:mm:ss}] REAPED (queued) {reg.BuildId}: process dead, stale {staleSec:F0}s"); + _queuedBuilds.RemoveAt(i); + anyReaped = true; + } + } + + if (anyReaped) + { + // Try to promote queued builds into newly opened slots + TryPromotePending(); + RebalanceAll(); + } + } + + /// + /// Extract PID from build ID (format: "{PID}-{ticks}") and check if the process is alive. + /// + private static bool IsProcessAlive(string buildId) + { + int dashIndex = buildId.IndexOf('-'); + if (dashIndex <= 0) + { + return false; // Can't parse — assume dead + } + + if (!int.TryParse(buildId.AsSpan(0, dashIndex), out int pid)) + { + return false; + } + + try + { + var process = Process.GetProcessById(pid); + return !process.HasExited; + } + catch (ArgumentException) + { + // Process doesn't exist + return false; + } + catch (InvalidOperationException) + { + return false; + } + } + + private sealed class BuildRegistration + { + internal string BuildId { get; } + internal int RequestedNodes { get; } + internal int GrantedNodes { get; set; } + internal DateTime RegisteredAt { get; } + internal DateTime QueuedAt { get; } + internal DateTime? PromotedAt { get; set; } + internal DateTime LastHeartbeat { get; set; } + internal int AcknowledgedEpoch { get; set; } + + internal BuildRegistration(string buildId, int requestedNodes, DateTime registeredAt) + { + BuildId = buildId; + RequestedNodes = requestedNodes; + GrantedNodes = requestedNodes; + RegisteredAt = registeredAt; + QueuedAt = registeredAt; + LastHeartbeat = registeredAt; + AcknowledgedEpoch = -1; // Not yet acknowledged + } + } + } +} diff --git a/src/Build/BackEnd/BuildCoordinator/BuildCoordinatorClient.cs b/src/Build/BackEnd/BuildCoordinator/BuildCoordinatorClient.cs new file mode 100644 index 00000000000..aad7333d6c1 --- /dev/null +++ b/src/Build/BackEnd/BuildCoordinator/BuildCoordinatorClient.cs @@ -0,0 +1,217 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.IO; +using System.IO.Pipes; +using System.Threading; + +namespace Microsoft.Build.BackEnd +{ + /// + /// Client used by BuildManager to communicate with an external BuildCoordinator process. + /// If no coordinator is running, all operations gracefully no-op and the build runs with + /// its original MaxNodeCount. + /// + /// When the coordinator is at capacity, TryRegister will block (heartbeating for position + /// updates) until the build is promoted to active. + /// + internal sealed class BuildCoordinatorClient : IDisposable + { + private readonly string _buildId; + private readonly string _pipeName; + private int _grantedNodes; + private Timer? _heartbeatTimer; + private bool _registered; + private Action? _onBudgetChanged; + + internal bool IsConnected => _registered; + internal int GrantedNodes => _grantedNodes; + internal string BuildId => _buildId; + + internal BuildCoordinatorClient() + { + _buildId = $"{Environment.ProcessId}-{DateTime.UtcNow.Ticks}"; + _pipeName = BuildCoordinator.GetPipeName(); + } + + /// + /// Try to register with the coordinator. Returns true if a coordinator was found + /// and the build was registered (or promoted from queue). + /// + /// If the coordinator has capacity, returns immediately with grantedNodes. + /// If queued, blocks and heartbeats until promoted, calling onQueuePositionChanged + /// with (position, totalQueued, waitSeconds) on each update. + /// + internal bool TryRegister(int requestedNodes, out int grantedNodes, Action? onQueuePositionChanged = null, CancellationToken ct = default) + { + grantedNodes = requestedNodes; + + string? response = SendCommand($"REGISTER {_buildId} {requestedNodes}"); + if (response == null) + { + return false; + } + + // Immediate grant + if (response.StartsWith("OK ", StringComparison.Ordinal)) + { + if (int.TryParse(response.AsSpan(3), out int granted) && granted > 0) + { + _grantedNodes = granted; + grantedNodes = granted; + _registered = true; + return true; + } + + return false; + } + + // Queued — block and heartbeat until promoted + if (response.StartsWith("QUEUED ", StringComparison.Ordinal)) + { + return WaitInQueue(requestedNodes, out grantedNodes, onQueuePositionChanged, ct); + } + + return false; + } + + /// + /// Block heartbeating until the coordinator promotes this build to active. + /// + private bool WaitInQueue(int requestedNodes, out int grantedNodes, Action? onQueuePositionChanged, CancellationToken ct) + { + grantedNodes = requestedNodes; + + while (!ct.IsCancellationRequested) + { + Thread.Sleep(2000); // Heartbeat interval + + string? hbResponse = SendCommand($"HEARTBEAT {_buildId}"); + if (hbResponse == null) + { + // Coordinator gone — fall through with original node count + return false; + } + + // Promoted to active! + if (hbResponse.StartsWith("OK ", StringComparison.Ordinal)) + { + if (int.TryParse(hbResponse.AsSpan(3), out int granted) && granted > 0) + { + _grantedNodes = granted; + grantedNodes = granted; + _registered = true; + return true; + } + + return false; + } + + // Still queued — parse position info: "QUEUED position totalQueued waitSec" + if (hbResponse.StartsWith("QUEUED ", StringComparison.Ordinal)) + { + string[] parts = hbResponse.Split(' '); + if (parts.Length >= 4 + && int.TryParse(parts[1], out int position) + && int.TryParse(parts[2], out int totalQueued) + && int.TryParse(parts[3], out int waitSec)) + { + onQueuePositionChanged?.Invoke(position, totalQueued, waitSec); + } + } + } + + // Cancelled — unregister + SendCommand($"UNREGISTER {_buildId}"); + return false; + } + + /// + /// Start periodic heartbeats that update the node budget. + /// When the coordinator rebalances (e.g., another build starts or stops), + /// the callback fires with the new budget. + /// + internal void StartHeartbeat(Action onBudgetChanged) + { + _onBudgetChanged = onBudgetChanged; + _heartbeatTimer = new Timer(HeartbeatCallback, null, TimeSpan.FromSeconds(2), TimeSpan.FromSeconds(2)); + } + + /// + /// Unregister from the coordinator and stop heartbeats. + /// + internal void Unregister() + { + _heartbeatTimer?.Dispose(); + _heartbeatTimer = null; + + if (_registered) + { + SendCommand($"UNREGISTER {_buildId}"); + _registered = false; + } + } + + public void Dispose() + { + Unregister(); + } + + private void HeartbeatCallback(object? state) + { + if (!_registered) + { + return; + } + + string? response = SendCommand($"HEARTBEAT {_buildId}"); + if (response != null && response.StartsWith("OK ", StringComparison.Ordinal)) + { + if (int.TryParse(response.AsSpan(3), out int newBudget) && newBudget > 0 && newBudget != _grantedNodes) + { + _grantedNodes = newBudget; + _onBudgetChanged?.Invoke(newBudget); + } + } + } + + /// + /// Send a command to the coordinator and return the response line, or null if connection fails. + /// Retries up to 3 times with 300ms delay to handle the brief gap between listener re-binds. + /// + private string? SendCommand(string command) + { + for (int attempt = 0; attempt < 3; attempt++) + { + try + { + using var client = new NamedPipeClientStream(".", _pipeName, PipeDirection.InOut, System.IO.Pipes.PipeOptions.CurrentUserOnly); + client.Connect(1000); + + using var writer = new StreamWriter(client, leaveOpen: true) { AutoFlush = true }; + using var reader = new StreamReader(client, leaveOpen: true); + + writer.WriteLine(command); + return reader.ReadLine(); + } + catch (TimeoutException) + { + if (attempt < 2) + { + Thread.Sleep(300); + } + } + catch (IOException) + { + if (attempt < 2) + { + Thread.Sleep(300); + } + } + } + + return null; + } + } +} diff --git a/src/Build/BackEnd/BuildManager/BuildManager.cs b/src/Build/BackEnd/BuildManager/BuildManager.cs index 9656669047a..f1cde74e437 100644 --- a/src/Build/BackEnd/BuildManager/BuildManager.cs +++ b/src/Build/BackEnd/BuildManager/BuildManager.cs @@ -267,6 +267,14 @@ public class BuildManager : INodePacketHandler, IBuildComponentHost, IDisposable private bool _hasProjectCacheServiceInitializedVsScenario; +#if NET + /// + /// Client for communicating with an external build coordinator process. + /// If a coordinator is running, it dynamically adjusts MaxNodeCount. + /// + private BuildCoordinatorClient? _coordinatorClient; +#endif + #if DEBUG /// /// true to wait for a debugger to be attached, otherwise false. @@ -687,6 +695,12 @@ public void BeginBuild(BuildParameters parameters) _noNodesActiveEvent!.Set(); } + // Try to register with external build coordinator (if running). + // This may block if the coordinator queues this build. +#if NET + TryRegisterWithCoordinator(); +#endif + ILoggingService InitializeLoggingService() { ILoggingService loggingService = CreateLoggingService( @@ -1026,6 +1040,12 @@ public void EndBuild() var exceptionsThrownInEndBuild = false; + // Unregister from coordinator early so other builds can scale up +#if NET + _coordinatorClient?.Dispose(); + _coordinatorClient = null; +#endif + try { lock (_syncLock) @@ -2270,7 +2290,16 @@ private void ShutdownConnectedNodes(bool abort) _executionCancellationTokenSource?.Cancel(); // If we are aborting, we will NOT reuse the nodes because their state may be compromised by attempts to shut down while the build is in-progress. - _nodeManager?.ShutdownConnectedNodes(!abort && _buildParameters!.EnableNodeReuse); + // When a coordinator is managing builds, disable reuse so nodes exit immediately + // instead of lingering for 15 minutes — the coordinator handles cross-build lifecycle. +#if NET + bool coordinatorActive = _coordinatorClient != null || (Environment.GetEnvironmentVariable("MSBUILD_COORDINATOR_DISABLE") != "1" + && System.IO.File.Exists(BuildCoordinator.GetPipeName())); + bool enableReuse = !abort && _buildParameters!.EnableNodeReuse && !coordinatorActive; +#else + bool enableReuse = !abort && _buildParameters!.EnableNodeReuse; +#endif + _nodeManager?.ShutdownConnectedNodes(enableReuse); // if we are aborting, the task host will hear about it in time through the task building infrastructure; // so only shut down the task host nodes if we're shutting down tidily (in which case, it is assumed that all @@ -2323,6 +2352,74 @@ private void VerifyStateInternal(BuildManagerState requiredState) /// /// Method called to reset the state of the system after a build. /// + /// + /// Attempts to register this build with an external build coordinator process. + /// If a coordinator is running, it may adjust MaxNodeCount or queue the build. + /// If no coordinator is running, does nothing and the build proceeds normally. + /// +#if NET + private void TryRegisterWithCoordinator() + { + if (_buildParameters == null) + { + return; + } + + // Allow opt-out via environment variable + if (Environment.GetEnvironmentVariable("MSBUILD_COORDINATOR_DISABLE") == "1") + { + return; + } + + var client = new BuildCoordinatorClient(); + int requestedNodes = _buildParameters.MaxNodeCount; + + bool registered = client.TryRegister( + requestedNodes, + out int grantedNodes, + onQueuePositionChanged: (position, total, waitSec) => + { + // Log queue position updates to console + Console.Error.WriteLine($" [coordinator] Queued: position {position}/{total}, waiting {waitSec}s"); + }, + ct: _executionCancellationTokenSource?.Token ?? CancellationToken.None); + + if (registered) + { + _coordinatorClient = client; + + if (grantedNodes != requestedNodes) + { + _buildParameters.MaxNodeCount = grantedNodes; + Console.Error.WriteLine($" [coordinator] Node budget: {grantedNodes} (requested {requestedNodes})"); + } + + // Start heartbeats — coordinator may adjust budget dynamically + client.StartHeartbeat(newBudget => + { + if (_buildParameters != null && newBudget != _buildParameters.MaxNodeCount) + { + int old = _buildParameters.MaxNodeCount; + _buildParameters.MaxNodeCount = newBudget; + + // If budget decreased, shut down excess worker nodes immediately + if (newBudget < old) + { + _nodeManager?.ShutdownExcessNodes(newBudget); + } + + Console.Error.WriteLine($" [coordinator] Budget changed: {old} → {newBudget}"); + } + }); + } + else + { + // No coordinator found — dispose client and run normally + client.Dispose(); + } + } +#endif + private void Reset() { _nodeManager?.UnregisterPacketHandler(NodePacketType.BuildRequestBlocker); diff --git a/src/Build/BackEnd/Components/Communications/INodeManager.cs b/src/Build/BackEnd/Components/Communications/INodeManager.cs index f5b79fa0ba7..b951c240106 100644 --- a/src/Build/BackEnd/Components/Communications/INodeManager.cs +++ b/src/Build/BackEnd/Components/Communications/INodeManager.cs @@ -54,6 +54,11 @@ internal interface INodeManager : IBuildComponent, void ClearPerBuildState(); IEnumerable GetProcesses(); + + /// + /// Shuts down out-of-proc nodes beyond the specified limit. + /// + void ShutdownExcessNodes(int maxNodesToKeep); #endregion } } diff --git a/src/Build/BackEnd/Components/Communications/NodeEndpointOutOfProc.cs b/src/Build/BackEnd/Components/Communications/NodeEndpointOutOfProc.cs index 437d741d41b..da46d0fb26f 100644 --- a/src/Build/BackEnd/Components/Communications/NodeEndpointOutOfProc.cs +++ b/src/Build/BackEnd/Components/Communications/NodeEndpointOutOfProc.cs @@ -26,7 +26,13 @@ internal NodeEndpointOutOfProc(bool enableReuse, bool lowPriority) _enableReuse = enableReuse; LowPriority = lowPriority; - InternalConstruct(); + // Use hash-based pipe name for fast discovery on Unix. + // Format: MSBuild-{hash}-{pid} — allows schedulers to find compatible nodes + // by listing /tmp/MSBuild-{hash}-* instead of probing all dotnet processes. + string? pipeName = NativeMethodsShared.IsUnixLike + ? NamedPipeUtil.GetHashBasedPipeName(GetHandshake().ComputeHash()) + : null; // Windows: keep legacy MSBuild{PID} naming + InternalConstruct(pipeName); } /// diff --git a/src/Build/BackEnd/Components/Communications/NodeManager.cs b/src/Build/BackEnd/Components/Communications/NodeManager.cs index fb83ceaf5c1..1193a895794 100644 --- a/src/Build/BackEnd/Components/Communications/NodeManager.cs +++ b/src/Build/BackEnd/Components/Communications/NodeManager.cs @@ -361,5 +361,13 @@ public IEnumerable GetProcesses() { return _outOfProcNodeProvider?.GetProcesses()!; } + + public void ShutdownExcessNodes(int maxNodesToKeep) + { + if (_outOfProcNodeProvider is NodeProviderOutOfProc provider) + { + provider.ShutdownExcessNodes(maxNodesToKeep); + } + } } } diff --git a/src/Build/BackEnd/Components/Communications/NodeProviderOutOfProc.cs b/src/Build/BackEnd/Components/Communications/NodeProviderOutOfProc.cs index 02e998cfa1e..7cb32d770d2 100644 --- a/src/Build/BackEnd/Components/Communications/NodeProviderOutOfProc.cs +++ b/src/Build/BackEnd/Components/Communications/NodeProviderOutOfProc.cs @@ -152,6 +152,26 @@ public void ShutdownConnectedNodes(bool enableReuse) ShutdownConnectedNodes(contextsToShutDown, enableReuse); } + /// + /// Shuts down connected nodes beyond the specified limit, keeping only nodes. + /// Excess nodes are sent NodeBuildComplete with enableReuse=true so they prepare for reuse by a future build. + /// + public void ShutdownExcessNodes(int maxNodesToKeep) + { + if (_nodeContexts.Count <= maxNodesToKeep) + { + return; + } + + int toShutdown = _nodeContexts.Count - maxNodesToKeep; + var excessContexts = _nodeContexts.Values.Take(toShutdown).ToList(); + + foreach (var context in excessContexts) + { + context.SendData(new NodeBuildComplete(true /* prepare for reuse */)); + } + } + /// /// Shuts down all of the managed nodes permanently. /// diff --git a/src/Build/BackEnd/Components/Communications/NodeProviderOutOfProcBase.cs b/src/Build/BackEnd/Components/Communications/NodeProviderOutOfProcBase.cs index 9f701208dde..25349a97a4a 100644 --- a/src/Build/BackEnd/Components/Communications/NodeProviderOutOfProcBase.cs +++ b/src/Build/BackEnd/Components/Communications/NodeProviderOutOfProcBase.cs @@ -49,6 +49,12 @@ internal abstract partial class NodeProviderOutOfProcBase /// private const int TimeoutForNewNodeCreation = 30000; + /// + /// The amount of time to wait when attempting to reuse an existing idle node. + /// Must be long enough for a sleeping node to wake and respond to the handshake. + /// + private const int TimeoutForNodeReuse = 1000; + /// /// The amount of time to wait for an out-of-proc node to exit. /// @@ -255,7 +261,34 @@ protected IList GetNodes( if (nodeReuseRequested) { IList possibleRunningNodesList; - (expectedProcessName, possibleRunningNodesList) = GetPossibleRunningNodes(msbuildLocation, expectedNodeMode); + + // On Unix, use hash-based pipe file listing for O(1) discovery of compatible nodes + // instead of enumerating all dotnet processes and probing each one. + if (NativeMethodsShared.IsUnixLike) + { + string handshakeHash = nodeLaunchData.Handshake.ComputeHash(); + IList pids = NamedPipeUtil.FindNodesByHandshakeHash(handshakeHash); + var processes = new List(pids.Count); + foreach (int pid in pids) + { + try + { + processes.Add(Process.GetProcessById(pid)); + } + catch + { + // Process may have exited between pipe file listing and this call. + } + } + + expectedProcessName = "dotnet"; + possibleRunningNodesList = processes; + } + else + { + (expectedProcessName, possibleRunningNodesList) = GetPossibleRunningNodes(msbuildLocation, expectedNodeMode); + } + possibleRunningNodes = new ConcurrentQueue(possibleRunningNodesList); if (possibleRunningNodesList.Count > 0) @@ -318,7 +351,7 @@ bool TryReuseAnyFromPossibleRunningNodes(int currentProcessId, int nodeId) _processesToIgnore.TryAdd(nodeLookupKey, default); // Attempt to connect to each process in turn. - Stream nodeStream = TryConnectToProcess(nodeToReuse.Id, 0 /* poll, don't wait for connections */, nodeLaunchData.Handshake, out HandshakeResult result); + Stream nodeStream = TryConnectToProcess(nodeToReuse.Id, TimeoutForNodeReuse, nodeLaunchData.Handshake, out HandshakeResult result); if (nodeStream != null) { // Connection successful, use this node. @@ -719,8 +752,11 @@ private static void ValidateRemotePipeSecurityOnWindows(NamedPipeClientStream no /// private Stream TryConnectToProcess(int nodeProcessId, int timeout, Handshake handshake, out HandshakeResult result) { - // Try and connect to the process. - string pipeName = NamedPipeUtil.GetPlatformSpecificPipeName(nodeProcessId); + // On Unix, nodes create pipes with hash-based names for fast discovery. + // On Windows, keep legacy MSBuild{PID} naming. + string pipeName = NativeMethodsShared.IsUnixLike + ? NamedPipeUtil.GetHashBasedPipeName(handshake.ComputeHash(), nodeProcessId) + : NamedPipeUtil.GetPlatformSpecificPipeName(nodeProcessId); #pragma warning disable SA1111, SA1009 // Closing parenthesis should be on line of last parameter NamedPipeClientStream nodeStream = new NamedPipeClientStream( diff --git a/src/Build/BackEnd/Components/Communications/TaskHostNodeManager.cs b/src/Build/BackEnd/Components/Communications/TaskHostNodeManager.cs index 66c881052b8..5aec9f9ca4a 100644 --- a/src/Build/BackEnd/Components/Communications/TaskHostNodeManager.cs +++ b/src/Build/BackEnd/Components/Communications/TaskHostNodeManager.cs @@ -184,5 +184,10 @@ IEnumerable INodeManager.GetProcesses() { return _outOfProcTaskHostNodeProvider.GetProcesses(); } + + void INodeManager.ShutdownExcessNodes(int maxNodesToKeep) + { + // Task host nodes are short-lived; no excess to trim. + } } } diff --git a/src/Build/Microsoft.Build.csproj b/src/Build/Microsoft.Build.csproj index 5343eabdc97..9a9ff2f8d26 100644 --- a/src/Build/Microsoft.Build.csproj +++ b/src/Build/Microsoft.Build.csproj @@ -169,6 +169,8 @@ + + diff --git a/src/MSBuild/XMake.cs b/src/MSBuild/XMake.cs index 089e93d533a..029d343a8af 100644 --- a/src/MSBuild/XMake.cs +++ b/src/MSBuild/XMake.cs @@ -310,7 +310,17 @@ public static int Main(string[] args) } int exitCode; + + // Check for coordinator mode: `dotnet msbuild --coordinator [--budget N] [--max-builds N]` +#if NET + if (IsCoordinatorMode(args, out int budget, out int maxBuilds)) + { + exitCode = RunCoordinator(budget, maxBuilds); + } + else if ( +#else if ( +#endif Environment.GetEnvironmentVariable(Traits.UseMSBuildServerEnvVarName) == "1" && !Traits.Instance.EscapeHatches.EnsureStdOutForChildNodesIsPrimaryStdout && CanRunServerBasedOnCommandLineSwitches(args)) @@ -337,6 +347,63 @@ public static int Main(string[] args) return exitCode; } + /// + /// Check if the command line requests coordinator mode. + /// Usage: msbuild --coordinator [--budget N] [--max-builds N] + /// +#if NET + private static bool IsCoordinatorMode(string[] args, out int budget, out int maxBuilds) + { + budget = 0; + maxBuilds = 2; + + bool found = false; + for (int i = 0; i < args.Length; i++) + { + if (args[i].Equals("--coordinator", StringComparison.OrdinalIgnoreCase)) + { + found = true; + } + else if (args[i].Equals("--budget", StringComparison.OrdinalIgnoreCase) && i + 1 < args.Length) + { + int.TryParse(args[i + 1], out budget); + i++; + } + else if (args[i].Equals("--max-builds", StringComparison.OrdinalIgnoreCase) && i + 1 < args.Length) + { + int.TryParse(args[i + 1], out maxBuilds); + i++; + } + } + + if (found && budget <= 0) + { + // Default budget: 80% of logical processors + budget = Math.Max(1, (int)(Environment.ProcessorCount * 0.8)); + } + + return found; + } + + /// + /// Run the build coordinator as a long-lived process. + /// + private static int RunCoordinator(int budget, int maxBuilds) + { + using var coordinator = new Microsoft.Build.BackEnd.BuildCoordinator(budget, maxBuilds); + coordinator.Start(); + + Console.CancelKeyPress += (_, e) => + { + e.Cancel = true; + coordinator.Stop(); + }; + + coordinator.WaitForShutdown(); + return 0; + } +#endif + /// /// Returns true if arguments allows or make sense to leverage msbuild server. /// diff --git a/src/Shared/CommunicationsUtilities.cs b/src/Shared/CommunicationsUtilities.cs index 8c4f6a50167..a35b8206d43 100644 --- a/src/Shared/CommunicationsUtilities.cs +++ b/src/Shared/CommunicationsUtilities.cs @@ -289,8 +289,14 @@ protected Handshake(HandshakeOptions nodeType, bool includeSessionId, string too int sessionId = 0; if (includeSessionId) { - using var currentProcess = Process.GetCurrentProcess(); - sessionId = currentProcess.SessionId; + if (NativeMethodsShared.IsWindows) + { + using var currentProcess = Process.GetCurrentProcess(); + sessionId = currentProcess.SessionId; + } + // On Unix, getsid() returns the session leader PID which differs per terminal, + // preventing cross-terminal node reuse. Use 0 since Unix doesn't need + // RDP-style session isolation. } _handshakeComponents = IsNetTaskHost @@ -340,35 +346,15 @@ private static HandshakeComponents CreateStandardComponents(int options, int sal public virtual string GetKey() => $"{_handshakeComponents.Options} {_handshakeComponents.Salt} {_handshakeComponents.FileVersionMajor} {_handshakeComponents.FileVersionMinor} {_handshakeComponents.FileVersionBuild} {_handshakeComponents.FileVersionPrivate} {_handshakeComponents.SessionId}".ToString(CultureInfo.InvariantCulture); public virtual byte? ExpectedVersionInFirstByte => CommunicationsUtilities.handshakeVersion; - } - internal sealed class ServerNodeHandshake : Handshake - { /// /// Caching computed hash. /// private string _computedHash = null; - public override byte? ExpectedVersionInFirstByte => null; - - internal ServerNodeHandshake(HandshakeOptions nodeType) - : base(nodeType, includeSessionId: false, toolsDirectory: null) - { - } - - public override HandshakeComponents RetrieveHandshakeComponents() => new HandshakeComponents( - CommunicationsUtilities.AvoidEndOfHandshakeSignal(_handshakeComponents.Options), - CommunicationsUtilities.AvoidEndOfHandshakeSignal(_handshakeComponents.Salt), - CommunicationsUtilities.AvoidEndOfHandshakeSignal(_handshakeComponents.FileVersionMajor), - CommunicationsUtilities.AvoidEndOfHandshakeSignal(_handshakeComponents.FileVersionMinor), - CommunicationsUtilities.AvoidEndOfHandshakeSignal(_handshakeComponents.FileVersionBuild), - CommunicationsUtilities.AvoidEndOfHandshakeSignal(_handshakeComponents.FileVersionPrivate)); - - public override string GetKey() => $"{_handshakeComponents.Options} {_handshakeComponents.Salt} {_handshakeComponents.FileVersionMajor} {_handshakeComponents.FileVersionMinor} {_handshakeComponents.FileVersionBuild} {_handshakeComponents.FileVersionPrivate}" - .ToString(CultureInfo.InvariantCulture); - /// /// Computes Handshake stable hash string representing whole state of handshake. + /// Used for hash-based pipe naming to enable fast node discovery without trial-and-error probing. /// public string ComputeHash() { @@ -391,6 +377,27 @@ public string ComputeHash() } } + internal sealed class ServerNodeHandshake : Handshake + { + public override byte? ExpectedVersionInFirstByte => null; + + internal ServerNodeHandshake(HandshakeOptions nodeType) + : base(nodeType, includeSessionId: false, toolsDirectory: null) + { + } + + public override HandshakeComponents RetrieveHandshakeComponents() => new HandshakeComponents( + CommunicationsUtilities.AvoidEndOfHandshakeSignal(_handshakeComponents.Options), + CommunicationsUtilities.AvoidEndOfHandshakeSignal(_handshakeComponents.Salt), + CommunicationsUtilities.AvoidEndOfHandshakeSignal(_handshakeComponents.FileVersionMajor), + CommunicationsUtilities.AvoidEndOfHandshakeSignal(_handshakeComponents.FileVersionMinor), + CommunicationsUtilities.AvoidEndOfHandshakeSignal(_handshakeComponents.FileVersionBuild), + CommunicationsUtilities.AvoidEndOfHandshakeSignal(_handshakeComponents.FileVersionPrivate)); + + public override string GetKey() => $"{_handshakeComponents.Options} {_handshakeComponents.Salt} {_handshakeComponents.FileVersionMajor} {_handshakeComponents.FileVersionMinor} {_handshakeComponents.FileVersionBuild} {_handshakeComponents.FileVersionPrivate}" + .ToString(CultureInfo.InvariantCulture); + } + /// /// This class contains utility methods for the MSBuild engine. /// diff --git a/src/Shared/NamedPipeUtil.cs b/src/Shared/NamedPipeUtil.cs index 0b85b05bacd..b0d43e70ee4 100644 --- a/src/Shared/NamedPipeUtil.cs +++ b/src/Shared/NamedPipeUtil.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +using System.Collections.Generic; using System.IO; using Microsoft.Build.Internal; @@ -20,6 +21,56 @@ internal static string GetPlatformSpecificPipeName(int? processId = null) return GetPlatformSpecificPipeName(pipeName); } + /// + /// Returns a pipe name that encodes both the handshake hash and the process ID. + /// Format: MSBuild-{hash}-{pid} + /// This allows discovery of compatible nodes by listing pipes matching the hash prefix, + /// eliminating trial-and-error probing of all dotnet processes. + /// + internal static string GetHashBasedPipeName(string handshakeHash, int? processId = null) + { + processId ??= EnvironmentUtilities.CurrentProcessId; + string pipeName = $"MSBuild-{handshakeHash}-{processId}"; + return GetPlatformSpecificPipeName(pipeName); + } + + /// + /// Finds pipe files matching a handshake hash and extracts their PIDs. + /// Only works on Unix where pipes are files in /tmp. + /// + internal static IList FindNodesByHandshakeHash(string handshakeHash) + { + var pids = new List(); + // GetPlatformSpecificPipeName returns full paths like /tmp/MSBuild-{hash}-{pid} + // on Unix, and .NET does NOT add CoreFxPipe_ prefix for absolute paths. + string prefix = $"MSBuild-{handshakeHash}-"; + string? pipeDir = NativeMethodsShared.IsUnixLike ? "/tmp" : null; + + if (pipeDir == null) + { + // On Windows, named pipes aren't files — fall back to legacy discovery. + return pids; + } + + try + { + foreach (string file in System.IO.Directory.GetFiles(pipeDir, $"MSBuild-{handshakeHash}-*")) + { + string fileName = Path.GetFileName(file); + if (fileName.StartsWith(prefix) && int.TryParse(fileName.Substring(prefix.Length), out int pid)) + { + pids.Add(pid); + } + } + } + catch + { + // Directory enumeration can fail; fall back to legacy. + } + + return pids; + } + internal static string GetPlatformSpecificPipeName(string pipeName) { if (NativeMethodsShared.IsUnixLike) diff --git a/src/Shared/NodeEndpointOutOfProcBase.cs b/src/Shared/NodeEndpointOutOfProcBase.cs index ae422e25e31..345b7317895 100644 --- a/src/Shared/NodeEndpointOutOfProcBase.cs +++ b/src/Shared/NodeEndpointOutOfProcBase.cs @@ -44,8 +44,10 @@ internal abstract class NodeEndpointOutOfProcBase : INodeEndpoint #if NETCOREAPP2_1_OR_GREATER /// /// The amount of time to wait for the client to connect to the host. + /// Reduced from 60s to 5s so that failed reuse probes don't block idle nodes + /// from reaching their connection timeout check. /// - private const int ClientConnectTimeout = 60000; + private const int ClientConnectTimeout = 5000; #endif // NETCOREAPP2_1 /// From 0452e707e00c2130d1f7d1212b2c7f3351f892b9 Mon Sep 17 00:00:00 2001 From: JakeRadMSFT Date: Sun, 8 Mar 2026 18:01:55 -0700 Subject: [PATCH 2/2] Add unit tests + reduce node idle timeout to 30s - BuildCoordinator_Tests.cs: 29 tests covering coordinator protocol, fair-share budgeting, client retry logic, concurrent connections, staleness reaper, and cleanup behavior - HashBasedPipeNaming_Tests.cs: 14 tests covering ComputeHash determinism/caching, GetHashBasedPipeName format, FindNodesByHandshakeHash file matching, and SessionId=0 fix on Unix - Reduce DefaultNodeConnectionTimeout from 15 minutes to 30 seconds so idle nodes clean up promptly instead of lingering --- .../BackEnd/BuildCoordinator_Tests.cs | 884 ++++++++++++++++++ .../BackEnd/HashBasedPipeNaming_Tests.cs | 245 +++++ src/Shared/CommunicationsUtilities.cs | 2 +- 3 files changed, 1130 insertions(+), 1 deletion(-) create mode 100644 src/Build.UnitTests/BackEnd/BuildCoordinator_Tests.cs create mode 100644 src/Build.UnitTests/BackEnd/HashBasedPipeNaming_Tests.cs diff --git a/src/Build.UnitTests/BackEnd/BuildCoordinator_Tests.cs b/src/Build.UnitTests/BackEnd/BuildCoordinator_Tests.cs new file mode 100644 index 00000000000..c12026c7f58 --- /dev/null +++ b/src/Build.UnitTests/BackEnd/BuildCoordinator_Tests.cs @@ -0,0 +1,884 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +#if NET + +using System; +using System.Collections.Generic; +using System.IO; +using System.IO.Pipes; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Build.BackEnd; +using Microsoft.Build.Shared; +using Shouldly; +using Xunit; +using Xunit.Abstractions; + +namespace Microsoft.Build.UnitTests.BackEnd +{ + /// + /// Tests for BuildCoordinator and BuildCoordinatorClient. + /// These are integration tests that spin up a real coordinator over named pipes. + /// + public class BuildCoordinator_Tests : IDisposable + { + private readonly ITestOutputHelper _output; + private readonly string _testPipeName; + + public BuildCoordinator_Tests(ITestOutputHelper output) + { + _output = output; + // Use a unique pipe name per test run to avoid collisions + _testPipeName = NativeMethodsShared.IsUnixLike + ? $"/tmp/MSBuild-CoordinatorTest-{Guid.NewGuid():N}" + : $"MSBuild-CoordinatorTest-{Guid.NewGuid():N}"; + } + + public void Dispose() + { + // Clean up pipe file on Unix + if (NativeMethodsShared.IsUnixLike && File.Exists(_testPipeName)) + { + try { File.Delete(_testPipeName); } + catch { } + } + } + + #region Coordinator Protocol Tests + + [Fact] + public void FirstBuild_GetsFullBudget() + { + using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3); + coordinator.Start(); + + try + { + string? response = SendRawCommand("REGISTER build-1 6"); + response.ShouldNotBeNull(); + response.ShouldStartWith("OK "); + + // First build should get full budget (min of requested and total) + int granted = int.Parse(response!.Split(' ')[1]); + granted.ShouldBe(6); + } + finally + { + coordinator.Stop(); + } + } + + [Fact] + public void FirstBuild_GetsCappedAtTotalBudget() + { + using var coordinator = new BuildCoordinator(8, maxConcurrentBuilds: 3); + coordinator.Start(); + + try + { + // Request more than total budget + string? response = SendRawCommand("REGISTER build-1 20"); + response.ShouldNotBeNull(); + response.ShouldStartWith("OK "); + + int granted = int.Parse(response!.Split(' ')[1]); + granted.ShouldBeLessThanOrEqualTo(8); + } + finally + { + coordinator.Stop(); + } + } + + [Fact] + public void SecondBuild_GetsQueued() + { + using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 1); + coordinator.Start(); + + try + { + // First build — immediate + string? r1 = SendRawCommand("REGISTER build-1 6"); + r1.ShouldStartWith("OK "); + + // Second build — queued (max concurrent = 1) + string? r2 = SendRawCommand("REGISTER build-2 6"); + r2.ShouldNotBeNull(); + r2.ShouldStartWith("QUEUED "); + + string[] parts = r2!.Split(' '); + int position = int.Parse(parts[1]); + position.ShouldBe(1); // First in queue + } + finally + { + coordinator.Stop(); + } + } + + [Fact] + public void Heartbeat_ReturnsCurrentBudget() + { + using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3); + coordinator.Start(); + + try + { + SendRawCommand("REGISTER build-1 6"); + + string? hb = SendRawCommand("HEARTBEAT build-1"); + hb.ShouldNotBeNull(); + hb.ShouldStartWith("OK "); + + int budget = int.Parse(hb!.Split(' ')[1]); + budget.ShouldBeGreaterThan(0); + } + finally + { + coordinator.Stop(); + } + } + + [Fact] + public void Heartbeat_ForQueuedBuild_ReturnsQueuePosition() + { + using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 1); + coordinator.Start(); + + try + { + SendRawCommand("REGISTER build-1 6"); + SendRawCommand("REGISTER build-2 6"); + + string? hb = SendRawCommand("HEARTBEAT build-2"); + hb.ShouldNotBeNull(); + hb.ShouldStartWith("QUEUED "); + } + finally + { + coordinator.Stop(); + } + } + + [Fact] + public void Unregister_RemovesActiveBuild() + { + using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3); + coordinator.Start(); + + try + { + SendRawCommand("REGISTER build-1 6"); + string? unreg = SendRawCommand("UNREGISTER build-1"); + unreg.ShouldNotBeNull(); + unreg.ShouldStartWith("OK"); + + // Status should show 0 active + string? status = SendRawCommand("STATUS"); + status.ShouldNotBeNull(); + status.ShouldContain("active=0"); + } + finally + { + coordinator.Stop(); + } + } + + [Fact] + public void Unregister_PromotesQueuedBuild() + { + using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 1); + coordinator.Start(); + + try + { + SendRawCommand("REGISTER build-1 6"); + SendRawCommand("REGISTER build-2 6"); + + // Unregister first build — should promote build-2 + string? unreg = SendRawCommand("UNREGISTER build-1"); + unreg.ShouldNotBeNull(); + unreg.ShouldContain("promoted"); + unreg.ShouldContain("build-2"); + + // build-2 should now get OK on heartbeat (it's active) + string? hb = SendRawCommand("HEARTBEAT build-2"); + hb.ShouldStartWith("OK "); + } + finally + { + coordinator.Stop(); + } + } + + [Fact] + public void BudgetRebalances_WhenSecondBuildJoins() + { + using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3); + coordinator.Start(); + + try + { + // First build gets full budget + string? r1 = SendRawCommand("REGISTER build-1 12"); + int firstGrant = int.Parse(r1!.Split(' ')[1]); + firstGrant.ShouldBe(12); + + // Second build joins + SendRawCommand("REGISTER build-2 12"); + + // First build heartbeats — should get reduced budget (acknowledges epoch) + string? hb1 = SendRawCommand("HEARTBEAT build-1"); + int newBudget = int.Parse(hb1!.Split(' ')[1]); + newBudget.ShouldBe(6); // 12 / 2 builds = 6 each + + // Second build should now be promoted after build-1 acknowledged + // Heartbeat for build-2 should return OK (promoted) + string? hb2 = SendRawCommand("HEARTBEAT build-2"); + hb2.ShouldStartWith("OK "); + int budget2 = int.Parse(hb2!.Split(' ')[1]); + budget2.ShouldBe(6); + } + finally + { + coordinator.Stop(); + } + } + + [Fact] + public void BudgetIncreases_WhenBuildLeaves() + { + using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3); + coordinator.Start(); + + try + { + SendRawCommand("REGISTER build-1 12"); + SendRawCommand("REGISTER build-2 12"); + + // Acknowledge epoch so build-2 promotes + SendRawCommand("HEARTBEAT build-1"); + SendRawCommand("HEARTBEAT build-2"); + + // Now unregister build-2 + SendRawCommand("UNREGISTER build-2"); + + // build-1 should get full budget back + string? hb = SendRawCommand("HEARTBEAT build-1"); + int budget = int.Parse(hb!.Split(' ')[1]); + budget.ShouldBe(12); + } + finally + { + coordinator.Stop(); + } + } + + [Fact] + public void Status_ReturnsCorrectSummary() + { + using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 2); + coordinator.Start(); + + try + { + SendRawCommand("REGISTER build-1 6"); + SendRawCommand("REGISTER build-2 6"); + SendRawCommand("REGISTER build-3 6"); // Will be queued + + // Acknowledge so build-2 promotes + SendRawCommand("HEARTBEAT build-1"); + + string? status = SendRawCommand("STATUS"); + status.ShouldNotBeNull(); + status.ShouldContain("budget=12"); + status.ShouldContain("max=2"); + } + finally + { + coordinator.Stop(); + } + } + + [Fact] + public void Shutdown_StopsCoordinator() + { + using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3); + coordinator.Start(); + + string? response = SendRawCommand("SHUTDOWN"); + response.ShouldBe("OK"); + + // Coordinator should stop — WaitForShutdown should return + coordinator.WaitForShutdown(); + } + + [Fact] + public void UnknownCommand_ReturnsError() + { + using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3); + coordinator.Start(); + + try + { + string? response = SendRawCommand("INVALID_CMD"); + response.ShouldNotBeNull(); + response.ShouldStartWith("ERR"); + } + finally + { + coordinator.Stop(); + } + } + + [Fact] + public void Register_WithInvalidArgs_ReturnsError() + { + using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3); + coordinator.Start(); + + try + { + string? response = SendRawCommand("REGISTER"); + response.ShouldNotBeNull(); + response.ShouldStartWith("ERR"); + } + finally + { + coordinator.Stop(); + } + } + + [Fact] + public void EpochGating_PreventsPromotionBeforeAcknowledgment() + { + using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3); + coordinator.Start(); + + try + { + SendRawCommand("REGISTER build-1 12"); + SendRawCommand("REGISTER build-2 12"); // Queued — epoch bumped + + // build-2 heartbeats BEFORE build-1 acknowledges — should still be queued + string? hb2 = SendRawCommand("HEARTBEAT build-2"); + hb2.ShouldStartWith("QUEUED "); + + // Now build-1 acknowledges via heartbeat + SendRawCommand("HEARTBEAT build-1"); + + // build-2 should now be promoted + string? hb2After = SendRawCommand("HEARTBEAT build-2"); + hb2After.ShouldStartWith("OK "); + } + finally + { + coordinator.Stop(); + } + } + + [Fact] + public void MaxConcurrency_EnforcesLimit() + { + using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 2); + coordinator.Start(); + + try + { + SendRawCommand("REGISTER build-1 6"); + SendRawCommand("REGISTER build-2 6"); + + // Acknowledge so build-2 promotes + SendRawCommand("HEARTBEAT build-1"); + + // Third build should be queued (max=2) + SendRawCommand("REGISTER build-3 6"); + string? hb3 = SendRawCommand("HEARTBEAT build-3"); + hb3.ShouldStartWith("QUEUED "); + + // Even after all heartbeats, build-3 stays queued + SendRawCommand("HEARTBEAT build-1"); + SendRawCommand("HEARTBEAT build-2"); + hb3 = SendRawCommand("HEARTBEAT build-3"); + hb3.ShouldStartWith("QUEUED "); + } + finally + { + coordinator.Stop(); + } + } + + [Fact] + public void FairShare_DistributesBudgetEvenly() + { + using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3); + coordinator.Start(); + + try + { + // Register 3 builds and get them all active + SendRawCommand("REGISTER build-1 12"); + SendRawCommand("REGISTER build-2 12"); + SendRawCommand("HEARTBEAT build-1"); // Acknowledge for build-2 promotion + SendRawCommand("REGISTER build-3 12"); + SendRawCommand("HEARTBEAT build-1"); // Acknowledge for build-3 + SendRawCommand("HEARTBEAT build-2"); // Acknowledge for build-3 + + // All three should get 4 nodes each (12 / 3) + string? hb1 = SendRawCommand("HEARTBEAT build-1"); + string? hb2 = SendRawCommand("HEARTBEAT build-2"); + string? hb3 = SendRawCommand("HEARTBEAT build-3"); + + int b1 = int.Parse(hb1!.Split(' ')[1]); + int b2 = int.Parse(hb2!.Split(' ')[1]); + int b3 = int.Parse(hb3!.Split(' ')[1]); + + b1.ShouldBe(4); + b2.ShouldBe(4); + b3.ShouldBe(4); + } + finally + { + coordinator.Stop(); + } + } + + [Fact] + public void FairShare_CapsAtRequestedNodes() + { + using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3); + coordinator.Start(); + + try + { + // build-1 only wants 2 nodes + SendRawCommand("REGISTER build-1 2"); + SendRawCommand("REGISTER build-2 12"); + SendRawCommand("HEARTBEAT build-1"); + + // build-1 should get 2 (capped at requested), build-2 gets 6 (fair share) + string? hb1 = SendRawCommand("HEARTBEAT build-1"); + string? hb2 = SendRawCommand("HEARTBEAT build-2"); + + int b1 = int.Parse(hb1!.Split(' ')[1]); + int b2 = int.Parse(hb2!.Split(' ')[1]); + + b1.ShouldBe(2); + b2.ShouldBe(6); + } + finally + { + coordinator.Stop(); + } + } + + #endregion + + #region Client Tests + + [Fact] + public void Client_NoCoordinator_TryRegisterReturnsFalse() + { + // This test verifies that when no coordinator is listening on the expected pipe, + // the client gracefully returns false. We use a custom pipe name that doesn't exist. + // Since BuildCoordinatorClient always uses GetPipeName(), we can't easily redirect it. + // Instead, verify that SendCommand returns null for a nonexistent pipe by checking + // that a raw connection to a bogus pipe fails. + string bogusPipe = NativeMethodsShared.IsUnixLike + ? $"/tmp/MSBuild-NoPipe-{Guid.NewGuid():N}" + : $"MSBuild-NoPipe-{Guid.NewGuid():N}"; + + bool connected = false; + try + { + using var client = new System.IO.Pipes.NamedPipeClientStream(".", bogusPipe, PipeDirection.InOut, PipeOptions.CurrentUserOnly); + client.Connect(500); + connected = true; + } + catch (TimeoutException) + { + // Expected — no server + } + catch (IOException) + { + // Expected — no server + } + + connected.ShouldBeFalse("Connection to nonexistent pipe should fail"); + } + + [Fact] + public void Client_RegistersWithCoordinator() + { + using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3); + coordinator.Start(); + + try + { + using var client = new BuildCoordinatorClient(); + bool registered = client.TryRegister(6, out int granted); + + registered.ShouldBeTrue(); + client.IsConnected.ShouldBeTrue(); + granted.ShouldBe(6); + client.GrantedNodes.ShouldBe(6); + } + finally + { + coordinator.Stop(); + } + } + + [Fact] + public void Client_HeartbeatUpdatesbudget() + { + using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3); + coordinator.Start(); + + try + { + using var client1 = new BuildCoordinatorClient(); + client1.TryRegister(12, out _); + + int? budgetUpdate = null; + using var budgetChanged = new ManualResetEventSlim(false); + + client1.StartHeartbeat(newBudget => + { + budgetUpdate = newBudget; + budgetChanged.Set(); + }); + + // Register a second build via raw protocol to trigger rebalance + SendRawCommand($"REGISTER second-build 12"); + // Heartbeat will fire every 2s and get the new budget + // Wait up to 10s for the budget change callback + budgetChanged.Wait(TimeSpan.FromSeconds(10)).ShouldBeTrue("Budget change callback should fire"); + budgetUpdate.ShouldBe(6); // 12 / 2 = 6 + } + finally + { + coordinator.Stop(); + } + } + + [Fact] + public void Client_UnregisterCleansUp() + { + using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3); + coordinator.Start(); + + try + { + var client = new BuildCoordinatorClient(); + client.TryRegister(6, out _); + client.IsConnected.ShouldBeTrue(); + + client.Dispose(); // Triggers Unregister + + // Coordinator should show 0 active + string? status = SendRawCommand("STATUS"); + status!.ShouldContain("active=0"); + } + finally + { + coordinator.Stop(); + } + } + + [Fact] + public void Client_QueuedBuild_BlocksUntilPromoted() + { + using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 1); + coordinator.Start(); + + try + { + // First build registers immediately + using var client1 = new BuildCoordinatorClient(); + client1.TryRegister(6, out _); + + // Start heartbeats so client1 acknowledges epochs + client1.StartHeartbeat(_ => { }); + + // Second build should block in queue + var queuePositions = new List(); + using var client2 = new BuildCoordinatorClient(); + + var registerTask = Task.Run(() => + { + return client2.TryRegister(6, out _, onQueuePositionChanged: (pos, total, wait) => + { + _output.WriteLine($"Queue position: {pos}/{total}, waiting {wait}s"); + queuePositions.Add(pos); + }); + }); + + // Give client2 time to register and start heartbeating in queue + Thread.Sleep(3000); + registerTask.IsCompleted.ShouldBeFalse("Build should still be queued"); + + // Unregister first build — should promote second + client1.Dispose(); + + // Second build should complete registration + bool registered = registerTask.Wait(TimeSpan.FromSeconds(15)) + ? registerTask.Result + : false; + + registered.ShouldBeTrue("Queued build should be promoted after first build unregisters"); + client2.IsConnected.ShouldBeTrue(); + queuePositions.ShouldNotBeEmpty(); + } + finally + { + coordinator.Stop(); + } + } + + [Fact] + public void Client_QueuedBuild_CancellationUnregisters() + { + using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 1); + coordinator.Start(); + + try + { + using var client1 = new BuildCoordinatorClient(); + client1.TryRegister(6, out _); + + using var cts = new CancellationTokenSource(); + using var client2 = new BuildCoordinatorClient(); + + var registerTask = Task.Run(() => + client2.TryRegister(6, out _, ct: cts.Token)); + + // Let it enter queue + Thread.Sleep(3000); + + // Cancel + cts.Cancel(); + + bool registered = registerTask.Wait(TimeSpan.FromSeconds(10)) + ? registerTask.Result + : true; // If timed out, fail + + registered.ShouldBeFalse("Cancelled registration should return false"); + } + finally + { + coordinator.Stop(); + } + } + + #endregion + + #region GetPipeName Tests + + [Fact] + public void GetPipeName_ContainsUsername() + { + string pipeName = BuildCoordinator.GetPipeName(); + pipeName.ShouldContain(Environment.UserName); + } + + [Fact] + public void GetPipeName_OnUnix_IsAbsolutePath() + { + if (!NativeMethodsShared.IsUnixLike) + { + return; // Skip on Windows + } + + string pipeName = BuildCoordinator.GetPipeName(); + pipeName.ShouldStartWith("/tmp/"); + } + + #endregion + + #region Staleness Reaper Tests + + [Fact] + public void StalenessReaper_ReapsDeadBuild() + { + using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3); + coordinator.Start(); + + try + { + // Register a build with a PID that definitely doesn't exist. + // Build ID format is "{PID}-{ticks}" — use PID 99999999 + string deadBuildId = "99999999-123456789"; + string? r = SendRawCommand($"REGISTER {deadBuildId} 6"); + r.ShouldStartWith("OK "); + + // Verify it's active + string? status1 = SendRawCommand("STATUS"); + status1!.ShouldContain("active=1"); + + // Wait for the staleness reaper to detect it (10s stale + 5s reap interval) + // The reaper checks every 5s and requires heartbeat to be stale for 10s + Thread.Sleep(18000); + + // Build should have been reaped + string? status2 = SendRawCommand("STATUS"); + status2!.ShouldContain("active=0"); + } + finally + { + coordinator.Stop(); + } + } + + [Fact] + public void StalenessReaper_DoesNotReapLiveBuild() + { + using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3); + coordinator.Start(); + + try + { + // Register with current PID — process is definitely alive + string liveBuildId = $"{Environment.ProcessId}-{DateTime.UtcNow.Ticks}"; + SendRawCommand($"REGISTER {liveBuildId} 6"); + + // Wait past the stale threshold but keep heartbeating + for (int i = 0; i < 4; i++) + { + Thread.Sleep(3000); + SendRawCommand($"HEARTBEAT {liveBuildId}"); + } + + // Build should still be active + string? status = SendRawCommand("STATUS"); + status!.ShouldContain("active=1"); + } + finally + { + coordinator.Stop(); + } + } + + [Fact] + public void StalenessReaper_PromotesQueuedAfterReap() + { + using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 1); + coordinator.Start(); + + try + { + // First build — dead PID + string deadBuildId = "99999999-111111"; + SendRawCommand($"REGISTER {deadBuildId} 6"); + + // Second build — queued (max concurrent = 1) + string liveBuildId = $"{Environment.ProcessId}-{DateTime.UtcNow.Ticks}"; + SendRawCommand($"REGISTER {liveBuildId} 6"); + + // Verify: 1 active, 1 queued + string? status1 = SendRawCommand("STATUS"); + status1!.ShouldContain("active=1"); + + // Wait for reaper to kill the dead build and promote the queued one + Thread.Sleep(18000); + + // Live build should now be active + string? hb = SendRawCommand($"HEARTBEAT {liveBuildId}"); + hb.ShouldStartWith("OK "); + + string? status2 = SendRawCommand("STATUS"); + status2!.ShouldContain("active=1"); + } + finally + { + coordinator.Stop(); + } + } + + #endregion + + #region Concurrent Connection Tests + + [Fact] + public void ConcurrentRegistrations_AllSucceed() + { + using var coordinator = new BuildCoordinator(24, maxConcurrentBuilds: 5); + coordinator.Start(); + + try + { + int successCount = 0; + var tasks = new Task[5]; + + for (int i = 0; i < 5; i++) + { + int buildNum = i; + tasks[i] = Task.Run(() => + { + string? response = SendRawCommand($"REGISTER concurrent-{buildNum} 6"); + if (response != null && (response.StartsWith("OK ") || response.StartsWith("QUEUED "))) + { + Interlocked.Increment(ref successCount); + } + }); + } + + Task.WaitAll(tasks, TimeSpan.FromSeconds(15)); + successCount.ShouldBe(5); + } + finally + { + coordinator.Stop(); + } + } + + #endregion + + #region Helpers + + /// + /// Send a raw command to the coordinator using the well-known pipe name. + /// + private string? SendRawCommand(string command) + { + string pipeName = BuildCoordinator.GetPipeName(); + + for (int attempt = 0; attempt < 3; attempt++) + { + try + { + using var client = new NamedPipeClientStream(".", pipeName, PipeDirection.InOut, PipeOptions.CurrentUserOnly); + client.Connect(2000); + + using var writer = new StreamWriter(client, leaveOpen: true) { AutoFlush = true }; + using var reader = new StreamReader(client, leaveOpen: true); + + writer.WriteLine(command); + return reader.ReadLine(); + } + catch (TimeoutException) + { + if (attempt < 2) + { + Thread.Sleep(500); + } + } + catch (IOException) + { + if (attempt < 2) + { + Thread.Sleep(500); + } + } + } + + return null; + } + + #endregion + } +} + +#endif diff --git a/src/Build.UnitTests/BackEnd/HashBasedPipeNaming_Tests.cs b/src/Build.UnitTests/BackEnd/HashBasedPipeNaming_Tests.cs new file mode 100644 index 00000000000..d996b66f3c3 --- /dev/null +++ b/src/Build.UnitTests/BackEnd/HashBasedPipeNaming_Tests.cs @@ -0,0 +1,245 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.IO; +using Microsoft.Build.Internal; +using Microsoft.Build.Shared; +using Shouldly; +using Xunit; + +#nullable disable + +namespace Microsoft.Build.UnitTests +{ + /// + /// Tests for hash-based pipe naming in NamedPipeUtil and Handshake.ComputeHash. + /// + public class HashBasedPipeNaming_Tests + { + #region ComputeHash Tests + + [Fact] + public void ComputeHash_ReturnsDeterministicValue() + { + var handshake = new Handshake(HandshakeOptions.NodeReuse); + string hash1 = handshake.ComputeHash(); + string hash2 = handshake.ComputeHash(); + + hash1.ShouldNotBeNullOrEmpty(); + hash2.ShouldNotBeNullOrEmpty(); + hash1.ShouldBe(hash2); + } + + [Fact] + public void ComputeHash_SameOptionsSameHash() + { + var h1 = new Handshake(HandshakeOptions.NodeReuse); + var h2 = new Handshake(HandshakeOptions.NodeReuse); + + h1.ComputeHash().ShouldBe(h2.ComputeHash()); + } + + [Fact] + public void ComputeHash_DifferentOptionsYieldDifferentHash() + { + var h1 = new Handshake(HandshakeOptions.NodeReuse); + var h2 = new Handshake(HandshakeOptions.None); + + h1.ComputeHash().ShouldNotBe(h2.ComputeHash()); + } + + [Fact] + public void ComputeHash_NoPaddingOrSlashes() + { + var handshake = new Handshake(HandshakeOptions.NodeReuse); + string hash = handshake.ComputeHash(); + + // Hash should be URL/filename-safe: no / or = characters + hash.ShouldNotContain("/"); + hash.ShouldNotContain("="); + } + + [Fact] + public void ComputeHash_IsCached() + { + var handshake = new Handshake(HandshakeOptions.NodeReuse); + string hash1 = handshake.ComputeHash(); + string hash2 = handshake.ComputeHash(); + + // Should be the exact same object reference (cached) + ReferenceEquals(hash1, hash2).ShouldBeTrue(); + } + + #endregion + + #region GetHashBasedPipeName Tests + + [Fact] + public void GetHashBasedPipeName_ContainsHashAndPid() + { + string hash = "abc123"; + int pid = 42; + string pipeName = NamedPipeUtil.GetHashBasedPipeName(hash, pid); + + pipeName.ShouldContain("MSBuild-abc123-42"); + } + + [Fact] + public void GetHashBasedPipeName_DefaultsToCurrentPid() + { + string hash = "testhash"; + string pipeName = NamedPipeUtil.GetHashBasedPipeName(hash); + + int currentPid = EnvironmentUtilities.CurrentProcessId; + pipeName.ShouldContain($"MSBuild-testhash-{currentPid}"); + } + + [Fact] + public void GetHashBasedPipeName_OnUnix_IsAbsolutePath() + { + if (!NativeMethodsShared.IsUnixLike) + { + return; + } + + string pipeName = NamedPipeUtil.GetHashBasedPipeName("hash", 123); + pipeName.ShouldStartWith("/tmp/"); + } + + #endregion + + #region FindNodesByHandshakeHash Tests + + [Fact] + public void FindNodesByHandshakeHash_ReturnsEmptyOnWindows() + { + if (NativeMethodsShared.IsUnixLike) + { + return; // Only test on Windows + } + + var pids = NamedPipeUtil.FindNodesByHandshakeHash("nonexistent"); + pids.ShouldBeEmpty(); + } + + [Fact] + public void FindNodesByHandshakeHash_FindsMatchingPipeFiles() + { + if (!NativeMethodsShared.IsUnixLike) + { + return; // Only works on Unix where pipes are files + } + + string testHash = $"test-{Guid.NewGuid():N}"; + + // Create fake pipe files in /tmp + string pipe1 = $"/tmp/MSBuild-{testHash}-1001"; + string pipe2 = $"/tmp/MSBuild-{testHash}-1002"; + string pipeOther = $"/tmp/MSBuild-otherhash-9999"; + + try + { + File.WriteAllText(pipe1, ""); + File.WriteAllText(pipe2, ""); + File.WriteAllText(pipeOther, ""); + + var pids = NamedPipeUtil.FindNodesByHandshakeHash(testHash); + + pids.ShouldContain(1001); + pids.ShouldContain(1002); + pids.ShouldNotContain(9999); + } + finally + { + File.Delete(pipe1); + File.Delete(pipe2); + File.Delete(pipeOther); + } + } + + [Fact] + public void FindNodesByHandshakeHash_IgnoresMalformedFileNames() + { + if (!NativeMethodsShared.IsUnixLike) + { + return; + } + + string testHash = $"test-{Guid.NewGuid():N}"; + string pipeGood = $"/tmp/MSBuild-{testHash}-5555"; + string pipeBad = $"/tmp/MSBuild-{testHash}-notanumber"; + + try + { + File.WriteAllText(pipeGood, ""); + File.WriteAllText(pipeBad, ""); + + var pids = NamedPipeUtil.FindNodesByHandshakeHash(testHash); + + pids.ShouldContain(5555); + pids.Count.ShouldBe(1); + } + finally + { + File.Delete(pipeGood); + File.Delete(pipeBad); + } + } + + [Fact] + public void FindNodesByHandshakeHash_ReturnsEmptyWhenNoMatches() + { + if (!NativeMethodsShared.IsUnixLike) + { + return; + } + + var pids = NamedPipeUtil.FindNodesByHandshakeHash($"nopipes-{Guid.NewGuid():N}"); + pids.ShouldBeEmpty(); + } + + #endregion + + #region SessionId Fix Tests + + [Fact] + public void Handshake_OnUnix_SessionIdIsZero() + { + if (!NativeMethodsShared.IsUnixLike) + { + return; + } + + // Two handshakes created from different contexts should have the same + // session ID (0) on Unix, enabling cross-terminal node reuse. + var h1 = new Handshake(HandshakeOptions.NodeReuse); + var h2 = new Handshake(HandshakeOptions.NodeReuse); + + // Same handshake key means same session ID was used + h1.GetKey().ShouldBe(h2.GetKey()); + } + + [Fact] + public void Handshake_SessionIdComponent_IsZeroOnUnix() + { + if (!NativeMethodsShared.IsUnixLike) + { + return; + } + + var handshake = new Handshake(HandshakeOptions.NodeReuse); + var components = handshake.RetrieveHandshakeComponents(); + + // SessionId should be 0 on Unix (may be transformed by AvoidEndOfHandshakeSignal) + // The key representation includes the raw session id + string key = handshake.GetKey(); + // Key format: "options salt major minor build private sessionId" + // Last component should be 0 + string[] keyParts = key.Split(' '); + keyParts[keyParts.Length - 1].ShouldBe("0"); + } + + #endregion + } +} diff --git a/src/Shared/CommunicationsUtilities.cs b/src/Shared/CommunicationsUtilities.cs index a35b8206d43..96e3191ec3f 100644 --- a/src/Shared/CommunicationsUtilities.cs +++ b/src/Shared/CommunicationsUtilities.cs @@ -416,7 +416,7 @@ internal static class CommunicationsUtilities /// /// The timeout to connect to a node. /// - private const int DefaultNodeConnectionTimeout = 900 * 1000; // 15 minutes; enough time that a dev will typically do another build in this time + private const int DefaultNodeConnectionTimeout = 30 * 1000; // 30 seconds /// /// Whether to trace communications