diff --git a/src/Build.UnitTests/BackEnd/BuildCoordinator_Tests.cs b/src/Build.UnitTests/BackEnd/BuildCoordinator_Tests.cs
new file mode 100644
index 00000000000..c12026c7f58
--- /dev/null
+++ b/src/Build.UnitTests/BackEnd/BuildCoordinator_Tests.cs
@@ -0,0 +1,884 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+#if NET
+
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.IO.Pipes;
+using System.Threading;
+using System.Threading.Tasks;
+using Microsoft.Build.BackEnd;
+using Microsoft.Build.Shared;
+using Shouldly;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Microsoft.Build.UnitTests.BackEnd
+{
+ ///
+ /// Tests for BuildCoordinator and BuildCoordinatorClient.
+ /// These are integration tests that spin up a real coordinator over named pipes.
+ ///
+ public class BuildCoordinator_Tests : IDisposable
+ {
+ private readonly ITestOutputHelper _output;
+ private readonly string _testPipeName;
+
+ public BuildCoordinator_Tests(ITestOutputHelper output)
+ {
+ _output = output;
+ // Use a unique pipe name per test run to avoid collisions
+ _testPipeName = NativeMethodsShared.IsUnixLike
+ ? $"/tmp/MSBuild-CoordinatorTest-{Guid.NewGuid():N}"
+ : $"MSBuild-CoordinatorTest-{Guid.NewGuid():N}";
+ }
+
+ public void Dispose()
+ {
+ // Clean up pipe file on Unix
+ if (NativeMethodsShared.IsUnixLike && File.Exists(_testPipeName))
+ {
+ try { File.Delete(_testPipeName); }
+ catch { }
+ }
+ }
+
+ #region Coordinator Protocol Tests
+
+ [Fact]
+ public void FirstBuild_GetsFullBudget()
+ {
+ using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3);
+ coordinator.Start();
+
+ try
+ {
+ string? response = SendRawCommand("REGISTER build-1 6");
+ response.ShouldNotBeNull();
+ response.ShouldStartWith("OK ");
+
+ // First build should get full budget (min of requested and total)
+ int granted = int.Parse(response!.Split(' ')[1]);
+ granted.ShouldBe(6);
+ }
+ finally
+ {
+ coordinator.Stop();
+ }
+ }
+
+ [Fact]
+ public void FirstBuild_GetsCappedAtTotalBudget()
+ {
+ using var coordinator = new BuildCoordinator(8, maxConcurrentBuilds: 3);
+ coordinator.Start();
+
+ try
+ {
+ // Request more than total budget
+ string? response = SendRawCommand("REGISTER build-1 20");
+ response.ShouldNotBeNull();
+ response.ShouldStartWith("OK ");
+
+ int granted = int.Parse(response!.Split(' ')[1]);
+ granted.ShouldBeLessThanOrEqualTo(8);
+ }
+ finally
+ {
+ coordinator.Stop();
+ }
+ }
+
+ [Fact]
+ public void SecondBuild_GetsQueued()
+ {
+ using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 1);
+ coordinator.Start();
+
+ try
+ {
+ // First build — immediate
+ string? r1 = SendRawCommand("REGISTER build-1 6");
+ r1.ShouldStartWith("OK ");
+
+ // Second build — queued (max concurrent = 1)
+ string? r2 = SendRawCommand("REGISTER build-2 6");
+ r2.ShouldNotBeNull();
+ r2.ShouldStartWith("QUEUED ");
+
+ string[] parts = r2!.Split(' ');
+ int position = int.Parse(parts[1]);
+ position.ShouldBe(1); // First in queue
+ }
+ finally
+ {
+ coordinator.Stop();
+ }
+ }
+
+ [Fact]
+ public void Heartbeat_ReturnsCurrentBudget()
+ {
+ using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3);
+ coordinator.Start();
+
+ try
+ {
+ SendRawCommand("REGISTER build-1 6");
+
+ string? hb = SendRawCommand("HEARTBEAT build-1");
+ hb.ShouldNotBeNull();
+ hb.ShouldStartWith("OK ");
+
+ int budget = int.Parse(hb!.Split(' ')[1]);
+ budget.ShouldBeGreaterThan(0);
+ }
+ finally
+ {
+ coordinator.Stop();
+ }
+ }
+
+ [Fact]
+ public void Heartbeat_ForQueuedBuild_ReturnsQueuePosition()
+ {
+ using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 1);
+ coordinator.Start();
+
+ try
+ {
+ SendRawCommand("REGISTER build-1 6");
+ SendRawCommand("REGISTER build-2 6");
+
+ string? hb = SendRawCommand("HEARTBEAT build-2");
+ hb.ShouldNotBeNull();
+ hb.ShouldStartWith("QUEUED ");
+ }
+ finally
+ {
+ coordinator.Stop();
+ }
+ }
+
+ [Fact]
+ public void Unregister_RemovesActiveBuild()
+ {
+ using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3);
+ coordinator.Start();
+
+ try
+ {
+ SendRawCommand("REGISTER build-1 6");
+ string? unreg = SendRawCommand("UNREGISTER build-1");
+ unreg.ShouldNotBeNull();
+ unreg.ShouldStartWith("OK");
+
+ // Status should show 0 active
+ string? status = SendRawCommand("STATUS");
+ status.ShouldNotBeNull();
+ status.ShouldContain("active=0");
+ }
+ finally
+ {
+ coordinator.Stop();
+ }
+ }
+
+ [Fact]
+ public void Unregister_PromotesQueuedBuild()
+ {
+ using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 1);
+ coordinator.Start();
+
+ try
+ {
+ SendRawCommand("REGISTER build-1 6");
+ SendRawCommand("REGISTER build-2 6");
+
+ // Unregister first build — should promote build-2
+ string? unreg = SendRawCommand("UNREGISTER build-1");
+ unreg.ShouldNotBeNull();
+ unreg.ShouldContain("promoted");
+ unreg.ShouldContain("build-2");
+
+ // build-2 should now get OK on heartbeat (it's active)
+ string? hb = SendRawCommand("HEARTBEAT build-2");
+ hb.ShouldStartWith("OK ");
+ }
+ finally
+ {
+ coordinator.Stop();
+ }
+ }
+
+ [Fact]
+ public void BudgetRebalances_WhenSecondBuildJoins()
+ {
+ using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3);
+ coordinator.Start();
+
+ try
+ {
+ // First build gets full budget
+ string? r1 = SendRawCommand("REGISTER build-1 12");
+ int firstGrant = int.Parse(r1!.Split(' ')[1]);
+ firstGrant.ShouldBe(12);
+
+ // Second build joins
+ SendRawCommand("REGISTER build-2 12");
+
+ // First build heartbeats — should get reduced budget (acknowledges epoch)
+ string? hb1 = SendRawCommand("HEARTBEAT build-1");
+ int newBudget = int.Parse(hb1!.Split(' ')[1]);
+ newBudget.ShouldBe(6); // 12 / 2 builds = 6 each
+
+ // Second build should now be promoted after build-1 acknowledged
+ // Heartbeat for build-2 should return OK (promoted)
+ string? hb2 = SendRawCommand("HEARTBEAT build-2");
+ hb2.ShouldStartWith("OK ");
+ int budget2 = int.Parse(hb2!.Split(' ')[1]);
+ budget2.ShouldBe(6);
+ }
+ finally
+ {
+ coordinator.Stop();
+ }
+ }
+
+ [Fact]
+ public void BudgetIncreases_WhenBuildLeaves()
+ {
+ using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3);
+ coordinator.Start();
+
+ try
+ {
+ SendRawCommand("REGISTER build-1 12");
+ SendRawCommand("REGISTER build-2 12");
+
+ // Acknowledge epoch so build-2 promotes
+ SendRawCommand("HEARTBEAT build-1");
+ SendRawCommand("HEARTBEAT build-2");
+
+ // Now unregister build-2
+ SendRawCommand("UNREGISTER build-2");
+
+ // build-1 should get full budget back
+ string? hb = SendRawCommand("HEARTBEAT build-1");
+ int budget = int.Parse(hb!.Split(' ')[1]);
+ budget.ShouldBe(12);
+ }
+ finally
+ {
+ coordinator.Stop();
+ }
+ }
+
+ [Fact]
+ public void Status_ReturnsCorrectSummary()
+ {
+ using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 2);
+ coordinator.Start();
+
+ try
+ {
+ SendRawCommand("REGISTER build-1 6");
+ SendRawCommand("REGISTER build-2 6");
+ SendRawCommand("REGISTER build-3 6"); // Will be queued
+
+ // Acknowledge so build-2 promotes
+ SendRawCommand("HEARTBEAT build-1");
+
+ string? status = SendRawCommand("STATUS");
+ status.ShouldNotBeNull();
+ status.ShouldContain("budget=12");
+ status.ShouldContain("max=2");
+ }
+ finally
+ {
+ coordinator.Stop();
+ }
+ }
+
+ [Fact]
+ public void Shutdown_StopsCoordinator()
+ {
+ using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3);
+ coordinator.Start();
+
+ string? response = SendRawCommand("SHUTDOWN");
+ response.ShouldBe("OK");
+
+ // Coordinator should stop — WaitForShutdown should return
+ coordinator.WaitForShutdown();
+ }
+
+ [Fact]
+ public void UnknownCommand_ReturnsError()
+ {
+ using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3);
+ coordinator.Start();
+
+ try
+ {
+ string? response = SendRawCommand("INVALID_CMD");
+ response.ShouldNotBeNull();
+ response.ShouldStartWith("ERR");
+ }
+ finally
+ {
+ coordinator.Stop();
+ }
+ }
+
+ [Fact]
+ public void Register_WithInvalidArgs_ReturnsError()
+ {
+ using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3);
+ coordinator.Start();
+
+ try
+ {
+ string? response = SendRawCommand("REGISTER");
+ response.ShouldNotBeNull();
+ response.ShouldStartWith("ERR");
+ }
+ finally
+ {
+ coordinator.Stop();
+ }
+ }
+
+ [Fact]
+ public void EpochGating_PreventsPromotionBeforeAcknowledgment()
+ {
+ using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3);
+ coordinator.Start();
+
+ try
+ {
+ SendRawCommand("REGISTER build-1 12");
+ SendRawCommand("REGISTER build-2 12"); // Queued — epoch bumped
+
+ // build-2 heartbeats BEFORE build-1 acknowledges — should still be queued
+ string? hb2 = SendRawCommand("HEARTBEAT build-2");
+ hb2.ShouldStartWith("QUEUED ");
+
+ // Now build-1 acknowledges via heartbeat
+ SendRawCommand("HEARTBEAT build-1");
+
+ // build-2 should now be promoted
+ string? hb2After = SendRawCommand("HEARTBEAT build-2");
+ hb2After.ShouldStartWith("OK ");
+ }
+ finally
+ {
+ coordinator.Stop();
+ }
+ }
+
+ [Fact]
+ public void MaxConcurrency_EnforcesLimit()
+ {
+ using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 2);
+ coordinator.Start();
+
+ try
+ {
+ SendRawCommand("REGISTER build-1 6");
+ SendRawCommand("REGISTER build-2 6");
+
+ // Acknowledge so build-2 promotes
+ SendRawCommand("HEARTBEAT build-1");
+
+ // Third build should be queued (max=2)
+ SendRawCommand("REGISTER build-3 6");
+ string? hb3 = SendRawCommand("HEARTBEAT build-3");
+ hb3.ShouldStartWith("QUEUED ");
+
+ // Even after all heartbeats, build-3 stays queued
+ SendRawCommand("HEARTBEAT build-1");
+ SendRawCommand("HEARTBEAT build-2");
+ hb3 = SendRawCommand("HEARTBEAT build-3");
+ hb3.ShouldStartWith("QUEUED ");
+ }
+ finally
+ {
+ coordinator.Stop();
+ }
+ }
+
+ [Fact]
+ public void FairShare_DistributesBudgetEvenly()
+ {
+ using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3);
+ coordinator.Start();
+
+ try
+ {
+ // Register 3 builds and get them all active
+ SendRawCommand("REGISTER build-1 12");
+ SendRawCommand("REGISTER build-2 12");
+ SendRawCommand("HEARTBEAT build-1"); // Acknowledge for build-2 promotion
+ SendRawCommand("REGISTER build-3 12");
+ SendRawCommand("HEARTBEAT build-1"); // Acknowledge for build-3
+ SendRawCommand("HEARTBEAT build-2"); // Acknowledge for build-3
+
+ // All three should get 4 nodes each (12 / 3)
+ string? hb1 = SendRawCommand("HEARTBEAT build-1");
+ string? hb2 = SendRawCommand("HEARTBEAT build-2");
+ string? hb3 = SendRawCommand("HEARTBEAT build-3");
+
+ int b1 = int.Parse(hb1!.Split(' ')[1]);
+ int b2 = int.Parse(hb2!.Split(' ')[1]);
+ int b3 = int.Parse(hb3!.Split(' ')[1]);
+
+ b1.ShouldBe(4);
+ b2.ShouldBe(4);
+ b3.ShouldBe(4);
+ }
+ finally
+ {
+ coordinator.Stop();
+ }
+ }
+
+ [Fact]
+ public void FairShare_CapsAtRequestedNodes()
+ {
+ using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3);
+ coordinator.Start();
+
+ try
+ {
+ // build-1 only wants 2 nodes
+ SendRawCommand("REGISTER build-1 2");
+ SendRawCommand("REGISTER build-2 12");
+ SendRawCommand("HEARTBEAT build-1");
+
+ // build-1 should get 2 (capped at requested), build-2 gets 6 (fair share)
+ string? hb1 = SendRawCommand("HEARTBEAT build-1");
+ string? hb2 = SendRawCommand("HEARTBEAT build-2");
+
+ int b1 = int.Parse(hb1!.Split(' ')[1]);
+ int b2 = int.Parse(hb2!.Split(' ')[1]);
+
+ b1.ShouldBe(2);
+ b2.ShouldBe(6);
+ }
+ finally
+ {
+ coordinator.Stop();
+ }
+ }
+
+ #endregion
+
+ #region Client Tests
+
+ [Fact]
+ public void Client_NoCoordinator_TryRegisterReturnsFalse()
+ {
+ // This test verifies that when no coordinator is listening on the expected pipe,
+ // the client gracefully returns false. We use a custom pipe name that doesn't exist.
+ // Since BuildCoordinatorClient always uses GetPipeName(), we can't easily redirect it.
+ // Instead, verify that SendCommand returns null for a nonexistent pipe by checking
+ // that a raw connection to a bogus pipe fails.
+ string bogusPipe = NativeMethodsShared.IsUnixLike
+ ? $"/tmp/MSBuild-NoPipe-{Guid.NewGuid():N}"
+ : $"MSBuild-NoPipe-{Guid.NewGuid():N}";
+
+ bool connected = false;
+ try
+ {
+ using var client = new System.IO.Pipes.NamedPipeClientStream(".", bogusPipe, PipeDirection.InOut, PipeOptions.CurrentUserOnly);
+ client.Connect(500);
+ connected = true;
+ }
+ catch (TimeoutException)
+ {
+ // Expected — no server
+ }
+ catch (IOException)
+ {
+ // Expected — no server
+ }
+
+ connected.ShouldBeFalse("Connection to nonexistent pipe should fail");
+ }
+
+ [Fact]
+ public void Client_RegistersWithCoordinator()
+ {
+ using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3);
+ coordinator.Start();
+
+ try
+ {
+ using var client = new BuildCoordinatorClient();
+ bool registered = client.TryRegister(6, out int granted);
+
+ registered.ShouldBeTrue();
+ client.IsConnected.ShouldBeTrue();
+ granted.ShouldBe(6);
+ client.GrantedNodes.ShouldBe(6);
+ }
+ finally
+ {
+ coordinator.Stop();
+ }
+ }
+
+ [Fact]
+ public void Client_HeartbeatUpdatesbudget()
+ {
+ using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3);
+ coordinator.Start();
+
+ try
+ {
+ using var client1 = new BuildCoordinatorClient();
+ client1.TryRegister(12, out _);
+
+ int? budgetUpdate = null;
+ using var budgetChanged = new ManualResetEventSlim(false);
+
+ client1.StartHeartbeat(newBudget =>
+ {
+ budgetUpdate = newBudget;
+ budgetChanged.Set();
+ });
+
+ // Register a second build via raw protocol to trigger rebalance
+ SendRawCommand($"REGISTER second-build 12");
+ // Heartbeat will fire every 2s and get the new budget
+ // Wait up to 10s for the budget change callback
+ budgetChanged.Wait(TimeSpan.FromSeconds(10)).ShouldBeTrue("Budget change callback should fire");
+ budgetUpdate.ShouldBe(6); // 12 / 2 = 6
+ }
+ finally
+ {
+ coordinator.Stop();
+ }
+ }
+
+ [Fact]
+ public void Client_UnregisterCleansUp()
+ {
+ using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3);
+ coordinator.Start();
+
+ try
+ {
+ var client = new BuildCoordinatorClient();
+ client.TryRegister(6, out _);
+ client.IsConnected.ShouldBeTrue();
+
+ client.Dispose(); // Triggers Unregister
+
+ // Coordinator should show 0 active
+ string? status = SendRawCommand("STATUS");
+ status!.ShouldContain("active=0");
+ }
+ finally
+ {
+ coordinator.Stop();
+ }
+ }
+
+ [Fact]
+ public void Client_QueuedBuild_BlocksUntilPromoted()
+ {
+ using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 1);
+ coordinator.Start();
+
+ try
+ {
+ // First build registers immediately
+ using var client1 = new BuildCoordinatorClient();
+ client1.TryRegister(6, out _);
+
+ // Start heartbeats so client1 acknowledges epochs
+ client1.StartHeartbeat(_ => { });
+
+ // Second build should block in queue
+ var queuePositions = new List();
+ using var client2 = new BuildCoordinatorClient();
+
+ var registerTask = Task.Run(() =>
+ {
+ return client2.TryRegister(6, out _, onQueuePositionChanged: (pos, total, wait) =>
+ {
+ _output.WriteLine($"Queue position: {pos}/{total}, waiting {wait}s");
+ queuePositions.Add(pos);
+ });
+ });
+
+ // Give client2 time to register and start heartbeating in queue
+ Thread.Sleep(3000);
+ registerTask.IsCompleted.ShouldBeFalse("Build should still be queued");
+
+ // Unregister first build — should promote second
+ client1.Dispose();
+
+ // Second build should complete registration
+ bool registered = registerTask.Wait(TimeSpan.FromSeconds(15))
+ ? registerTask.Result
+ : false;
+
+ registered.ShouldBeTrue("Queued build should be promoted after first build unregisters");
+ client2.IsConnected.ShouldBeTrue();
+ queuePositions.ShouldNotBeEmpty();
+ }
+ finally
+ {
+ coordinator.Stop();
+ }
+ }
+
+ [Fact]
+ public void Client_QueuedBuild_CancellationUnregisters()
+ {
+ using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 1);
+ coordinator.Start();
+
+ try
+ {
+ using var client1 = new BuildCoordinatorClient();
+ client1.TryRegister(6, out _);
+
+ using var cts = new CancellationTokenSource();
+ using var client2 = new BuildCoordinatorClient();
+
+ var registerTask = Task.Run(() =>
+ client2.TryRegister(6, out _, ct: cts.Token));
+
+ // Let it enter queue
+ Thread.Sleep(3000);
+
+ // Cancel
+ cts.Cancel();
+
+ bool registered = registerTask.Wait(TimeSpan.FromSeconds(10))
+ ? registerTask.Result
+ : true; // If timed out, fail
+
+ registered.ShouldBeFalse("Cancelled registration should return false");
+ }
+ finally
+ {
+ coordinator.Stop();
+ }
+ }
+
+ #endregion
+
+ #region GetPipeName Tests
+
+ [Fact]
+ public void GetPipeName_ContainsUsername()
+ {
+ string pipeName = BuildCoordinator.GetPipeName();
+ pipeName.ShouldContain(Environment.UserName);
+ }
+
+ [Fact]
+ public void GetPipeName_OnUnix_IsAbsolutePath()
+ {
+ if (!NativeMethodsShared.IsUnixLike)
+ {
+ return; // Skip on Windows
+ }
+
+ string pipeName = BuildCoordinator.GetPipeName();
+ pipeName.ShouldStartWith("/tmp/");
+ }
+
+ #endregion
+
+ #region Staleness Reaper Tests
+
+ [Fact]
+ public void StalenessReaper_ReapsDeadBuild()
+ {
+ using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3);
+ coordinator.Start();
+
+ try
+ {
+ // Register a build with a PID that definitely doesn't exist.
+ // Build ID format is "{PID}-{ticks}" — use PID 99999999
+ string deadBuildId = "99999999-123456789";
+ string? r = SendRawCommand($"REGISTER {deadBuildId} 6");
+ r.ShouldStartWith("OK ");
+
+ // Verify it's active
+ string? status1 = SendRawCommand("STATUS");
+ status1!.ShouldContain("active=1");
+
+ // Wait for the staleness reaper to detect it (10s stale + 5s reap interval)
+ // The reaper checks every 5s and requires heartbeat to be stale for 10s
+ Thread.Sleep(18000);
+
+ // Build should have been reaped
+ string? status2 = SendRawCommand("STATUS");
+ status2!.ShouldContain("active=0");
+ }
+ finally
+ {
+ coordinator.Stop();
+ }
+ }
+
+ [Fact]
+ public void StalenessReaper_DoesNotReapLiveBuild()
+ {
+ using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 3);
+ coordinator.Start();
+
+ try
+ {
+ // Register with current PID — process is definitely alive
+ string liveBuildId = $"{Environment.ProcessId}-{DateTime.UtcNow.Ticks}";
+ SendRawCommand($"REGISTER {liveBuildId} 6");
+
+ // Wait past the stale threshold but keep heartbeating
+ for (int i = 0; i < 4; i++)
+ {
+ Thread.Sleep(3000);
+ SendRawCommand($"HEARTBEAT {liveBuildId}");
+ }
+
+ // Build should still be active
+ string? status = SendRawCommand("STATUS");
+ status!.ShouldContain("active=1");
+ }
+ finally
+ {
+ coordinator.Stop();
+ }
+ }
+
+ [Fact]
+ public void StalenessReaper_PromotesQueuedAfterReap()
+ {
+ using var coordinator = new BuildCoordinator(12, maxConcurrentBuilds: 1);
+ coordinator.Start();
+
+ try
+ {
+ // First build — dead PID
+ string deadBuildId = "99999999-111111";
+ SendRawCommand($"REGISTER {deadBuildId} 6");
+
+ // Second build — queued (max concurrent = 1)
+ string liveBuildId = $"{Environment.ProcessId}-{DateTime.UtcNow.Ticks}";
+ SendRawCommand($"REGISTER {liveBuildId} 6");
+
+ // Verify: 1 active, 1 queued
+ string? status1 = SendRawCommand("STATUS");
+ status1!.ShouldContain("active=1");
+
+ // Wait for reaper to kill the dead build and promote the queued one
+ Thread.Sleep(18000);
+
+ // Live build should now be active
+ string? hb = SendRawCommand($"HEARTBEAT {liveBuildId}");
+ hb.ShouldStartWith("OK ");
+
+ string? status2 = SendRawCommand("STATUS");
+ status2!.ShouldContain("active=1");
+ }
+ finally
+ {
+ coordinator.Stop();
+ }
+ }
+
+ #endregion
+
+ #region Concurrent Connection Tests
+
+ [Fact]
+ public void ConcurrentRegistrations_AllSucceed()
+ {
+ using var coordinator = new BuildCoordinator(24, maxConcurrentBuilds: 5);
+ coordinator.Start();
+
+ try
+ {
+ int successCount = 0;
+ var tasks = new Task[5];
+
+ for (int i = 0; i < 5; i++)
+ {
+ int buildNum = i;
+ tasks[i] = Task.Run(() =>
+ {
+ string? response = SendRawCommand($"REGISTER concurrent-{buildNum} 6");
+ if (response != null && (response.StartsWith("OK ") || response.StartsWith("QUEUED ")))
+ {
+ Interlocked.Increment(ref successCount);
+ }
+ });
+ }
+
+ Task.WaitAll(tasks, TimeSpan.FromSeconds(15));
+ successCount.ShouldBe(5);
+ }
+ finally
+ {
+ coordinator.Stop();
+ }
+ }
+
+ #endregion
+
+ #region Helpers
+
+ ///
+ /// Send a raw command to the coordinator using the well-known pipe name.
+ ///
+ private string? SendRawCommand(string command)
+ {
+ string pipeName = BuildCoordinator.GetPipeName();
+
+ for (int attempt = 0; attempt < 3; attempt++)
+ {
+ try
+ {
+ using var client = new NamedPipeClientStream(".", pipeName, PipeDirection.InOut, PipeOptions.CurrentUserOnly);
+ client.Connect(2000);
+
+ using var writer = new StreamWriter(client, leaveOpen: true) { AutoFlush = true };
+ using var reader = new StreamReader(client, leaveOpen: true);
+
+ writer.WriteLine(command);
+ return reader.ReadLine();
+ }
+ catch (TimeoutException)
+ {
+ if (attempt < 2)
+ {
+ Thread.Sleep(500);
+ }
+ }
+ catch (IOException)
+ {
+ if (attempt < 2)
+ {
+ Thread.Sleep(500);
+ }
+ }
+ }
+
+ return null;
+ }
+
+ #endregion
+ }
+}
+
+#endif
diff --git a/src/Build.UnitTests/BackEnd/HashBasedPipeNaming_Tests.cs b/src/Build.UnitTests/BackEnd/HashBasedPipeNaming_Tests.cs
new file mode 100644
index 00000000000..d996b66f3c3
--- /dev/null
+++ b/src/Build.UnitTests/BackEnd/HashBasedPipeNaming_Tests.cs
@@ -0,0 +1,245 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System;
+using System.IO;
+using Microsoft.Build.Internal;
+using Microsoft.Build.Shared;
+using Shouldly;
+using Xunit;
+
+#nullable disable
+
+namespace Microsoft.Build.UnitTests
+{
+ ///
+ /// Tests for hash-based pipe naming in NamedPipeUtil and Handshake.ComputeHash.
+ ///
+ public class HashBasedPipeNaming_Tests
+ {
+ #region ComputeHash Tests
+
+ [Fact]
+ public void ComputeHash_ReturnsDeterministicValue()
+ {
+ var handshake = new Handshake(HandshakeOptions.NodeReuse);
+ string hash1 = handshake.ComputeHash();
+ string hash2 = handshake.ComputeHash();
+
+ hash1.ShouldNotBeNullOrEmpty();
+ hash2.ShouldNotBeNullOrEmpty();
+ hash1.ShouldBe(hash2);
+ }
+
+ [Fact]
+ public void ComputeHash_SameOptionsSameHash()
+ {
+ var h1 = new Handshake(HandshakeOptions.NodeReuse);
+ var h2 = new Handshake(HandshakeOptions.NodeReuse);
+
+ h1.ComputeHash().ShouldBe(h2.ComputeHash());
+ }
+
+ [Fact]
+ public void ComputeHash_DifferentOptionsYieldDifferentHash()
+ {
+ var h1 = new Handshake(HandshakeOptions.NodeReuse);
+ var h2 = new Handshake(HandshakeOptions.None);
+
+ h1.ComputeHash().ShouldNotBe(h2.ComputeHash());
+ }
+
+ [Fact]
+ public void ComputeHash_NoPaddingOrSlashes()
+ {
+ var handshake = new Handshake(HandshakeOptions.NodeReuse);
+ string hash = handshake.ComputeHash();
+
+ // Hash should be URL/filename-safe: no / or = characters
+ hash.ShouldNotContain("/");
+ hash.ShouldNotContain("=");
+ }
+
+ [Fact]
+ public void ComputeHash_IsCached()
+ {
+ var handshake = new Handshake(HandshakeOptions.NodeReuse);
+ string hash1 = handshake.ComputeHash();
+ string hash2 = handshake.ComputeHash();
+
+ // Should be the exact same object reference (cached)
+ ReferenceEquals(hash1, hash2).ShouldBeTrue();
+ }
+
+ #endregion
+
+ #region GetHashBasedPipeName Tests
+
+ [Fact]
+ public void GetHashBasedPipeName_ContainsHashAndPid()
+ {
+ string hash = "abc123";
+ int pid = 42;
+ string pipeName = NamedPipeUtil.GetHashBasedPipeName(hash, pid);
+
+ pipeName.ShouldContain("MSBuild-abc123-42");
+ }
+
+ [Fact]
+ public void GetHashBasedPipeName_DefaultsToCurrentPid()
+ {
+ string hash = "testhash";
+ string pipeName = NamedPipeUtil.GetHashBasedPipeName(hash);
+
+ int currentPid = EnvironmentUtilities.CurrentProcessId;
+ pipeName.ShouldContain($"MSBuild-testhash-{currentPid}");
+ }
+
+ [Fact]
+ public void GetHashBasedPipeName_OnUnix_IsAbsolutePath()
+ {
+ if (!NativeMethodsShared.IsUnixLike)
+ {
+ return;
+ }
+
+ string pipeName = NamedPipeUtil.GetHashBasedPipeName("hash", 123);
+ pipeName.ShouldStartWith("/tmp/");
+ }
+
+ #endregion
+
+ #region FindNodesByHandshakeHash Tests
+
+ [Fact]
+ public void FindNodesByHandshakeHash_ReturnsEmptyOnWindows()
+ {
+ if (NativeMethodsShared.IsUnixLike)
+ {
+ return; // Only test on Windows
+ }
+
+ var pids = NamedPipeUtil.FindNodesByHandshakeHash("nonexistent");
+ pids.ShouldBeEmpty();
+ }
+
+ [Fact]
+ public void FindNodesByHandshakeHash_FindsMatchingPipeFiles()
+ {
+ if (!NativeMethodsShared.IsUnixLike)
+ {
+ return; // Only works on Unix where pipes are files
+ }
+
+ string testHash = $"test-{Guid.NewGuid():N}";
+
+ // Create fake pipe files in /tmp
+ string pipe1 = $"/tmp/MSBuild-{testHash}-1001";
+ string pipe2 = $"/tmp/MSBuild-{testHash}-1002";
+ string pipeOther = $"/tmp/MSBuild-otherhash-9999";
+
+ try
+ {
+ File.WriteAllText(pipe1, "");
+ File.WriteAllText(pipe2, "");
+ File.WriteAllText(pipeOther, "");
+
+ var pids = NamedPipeUtil.FindNodesByHandshakeHash(testHash);
+
+ pids.ShouldContain(1001);
+ pids.ShouldContain(1002);
+ pids.ShouldNotContain(9999);
+ }
+ finally
+ {
+ File.Delete(pipe1);
+ File.Delete(pipe2);
+ File.Delete(pipeOther);
+ }
+ }
+
+ [Fact]
+ public void FindNodesByHandshakeHash_IgnoresMalformedFileNames()
+ {
+ if (!NativeMethodsShared.IsUnixLike)
+ {
+ return;
+ }
+
+ string testHash = $"test-{Guid.NewGuid():N}";
+ string pipeGood = $"/tmp/MSBuild-{testHash}-5555";
+ string pipeBad = $"/tmp/MSBuild-{testHash}-notanumber";
+
+ try
+ {
+ File.WriteAllText(pipeGood, "");
+ File.WriteAllText(pipeBad, "");
+
+ var pids = NamedPipeUtil.FindNodesByHandshakeHash(testHash);
+
+ pids.ShouldContain(5555);
+ pids.Count.ShouldBe(1);
+ }
+ finally
+ {
+ File.Delete(pipeGood);
+ File.Delete(pipeBad);
+ }
+ }
+
+ [Fact]
+ public void FindNodesByHandshakeHash_ReturnsEmptyWhenNoMatches()
+ {
+ if (!NativeMethodsShared.IsUnixLike)
+ {
+ return;
+ }
+
+ var pids = NamedPipeUtil.FindNodesByHandshakeHash($"nopipes-{Guid.NewGuid():N}");
+ pids.ShouldBeEmpty();
+ }
+
+ #endregion
+
+ #region SessionId Fix Tests
+
+ [Fact]
+ public void Handshake_OnUnix_SessionIdIsZero()
+ {
+ if (!NativeMethodsShared.IsUnixLike)
+ {
+ return;
+ }
+
+ // Two handshakes created from different contexts should have the same
+ // session ID (0) on Unix, enabling cross-terminal node reuse.
+ var h1 = new Handshake(HandshakeOptions.NodeReuse);
+ var h2 = new Handshake(HandshakeOptions.NodeReuse);
+
+ // Same handshake key means same session ID was used
+ h1.GetKey().ShouldBe(h2.GetKey());
+ }
+
+ [Fact]
+ public void Handshake_SessionIdComponent_IsZeroOnUnix()
+ {
+ if (!NativeMethodsShared.IsUnixLike)
+ {
+ return;
+ }
+
+ var handshake = new Handshake(HandshakeOptions.NodeReuse);
+ var components = handshake.RetrieveHandshakeComponents();
+
+ // SessionId should be 0 on Unix (may be transformed by AvoidEndOfHandshakeSignal)
+ // The key representation includes the raw session id
+ string key = handshake.GetKey();
+ // Key format: "options salt major minor build private sessionId"
+ // Last component should be 0
+ string[] keyParts = key.Split(' ');
+ keyParts[keyParts.Length - 1].ShouldBe("0");
+ }
+
+ #endregion
+ }
+}
diff --git a/src/Build/BackEnd/BuildCoordinator/BuildCoordinator.cs b/src/Build/BackEnd/BuildCoordinator/BuildCoordinator.cs
new file mode 100644
index 00000000000..f5b5e786cae
--- /dev/null
+++ b/src/Build/BackEnd/BuildCoordinator/BuildCoordinator.cs
@@ -0,0 +1,688 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System;
+using System.Collections.Concurrent;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.IO;
+using System.IO.Pipes;
+using System.Linq;
+using System.Threading;
+using System.Threading.Tasks;
+
+namespace Microsoft.Build.BackEnd
+{
+ ///
+ /// A standalone coordinator process that manages node budgets across concurrent MSBuild instances.
+ ///
+ /// Key design: heartbeat-gated promotion. When a new build registers and there are already
+ /// active builds, the new build is queued. Active builds learn their reduced budget on their
+ /// next heartbeat. Only after ALL active builds have acknowledged the reduction (via heartbeat)
+ /// is the new build promoted. This prevents temporarily exceeding the node budget.
+ ///
+ /// Protocol (line-based text over named pipe):
+ /// REGISTER buildId requestedNodes
+ /// → OK grantedNodes (first build — immediate)
+ /// → QUEUED position totalQueued (subsequent builds — wait for heartbeat gate)
+ ///
+ /// HEARTBEAT buildId
+ /// → OK grantedNodes (active build — may have new budget)
+ /// → QUEUED position totalQueued waitSec (queued build — position update)
+ ///
+ /// UNREGISTER buildId
+ /// → OK [promoted buildId] (promotes next queued build if any)
+ ///
+ /// STATUS
+ /// → Multi-line summary of active + queued builds
+ ///
+ /// SHUTDOWN
+ /// → OK
+ ///
+ public sealed class BuildCoordinator : IDisposable
+ {
+ ///
+ /// Well-known pipe name. All MSBuild instances for this user connect here.
+ /// On Unix: /tmp/MSBuild-Coordinator-{username}
+ /// On Windows: MSBuild-Coordinator-{username}
+ ///
+ internal static string GetPipeName()
+ {
+ string user = Environment.UserName;
+ string pipeName = $"MSBuild-Coordinator-{user}";
+
+ if (NativeMethodsShared.IsUnixLike)
+ {
+ return $"/tmp/{pipeName}";
+ }
+
+ return pipeName;
+ }
+
+ private readonly int _totalBudget;
+ private readonly int _maxConcurrentBuilds;
+ private readonly ConcurrentDictionary _activeBuilds = new();
+ private readonly List _queuedBuilds = new();
+ private readonly object _queueLock = new();
+ private readonly CancellationTokenSource _cts = new();
+ private Task? _listenTask;
+ private Timer? _stalenessReaper;
+ private Timer? _pipeWatchdog;
+ private CancellationTokenSource? _listenCycleCts;
+
+ ///
+ /// Epoch counter — bumped whenever the budget landscape changes and active builds
+ /// need to acknowledge their new budget before queued builds can be promoted.
+ ///
+ private int _rebalanceEpoch;
+
+ ///
+ /// If a build hasn't heartbeated in this many seconds, consider it dead.
+ ///
+ private const int StaleHeartbeatSeconds = 10;
+
+ public BuildCoordinator(int totalBudget, int maxConcurrentBuilds)
+ {
+ _totalBudget = totalBudget;
+ _maxConcurrentBuilds = maxConcurrentBuilds;
+ }
+
+ ///
+ /// Start listening for MSBuild client connections.
+ ///
+ public void Start()
+ {
+ string pipeName = GetPipeName();
+
+ // On Unix, clean up stale pipe file
+ if (NativeMethodsShared.IsUnixLike && File.Exists(pipeName))
+ {
+ File.Delete(pipeName);
+ }
+
+ Console.WriteLine($"Build Coordinator starting");
+ Console.WriteLine($" Pipe: {pipeName}");
+ Console.WriteLine($" Budget: {_totalBudget} nodes");
+ Console.WriteLine($" Max concurrent builds: {_maxConcurrentBuilds}");
+
+ _listenTask = Task.Run(() => ListenLoop(_cts.Token));
+
+ // Periodically reap builds that stopped heartbeating (crashed/killed process)
+ _stalenessReaper = new Timer(ReapStaleBuilds, null, TimeSpan.FromSeconds(5), TimeSpan.FromSeconds(5));
+
+ // Watch for pipe file deletion (e.g. by overzealous cleanup scripts)
+ if (NativeMethodsShared.IsUnixLike)
+ {
+ _pipeWatchdog = new Timer(CheckPipeHealth, null, TimeSpan.FromSeconds(2), TimeSpan.FromSeconds(2));
+ }
+ }
+
+ ///
+ /// Stop the coordinator and clean up.
+ ///
+ public void Stop()
+ {
+ _pipeWatchdog?.Dispose();
+ _pipeWatchdog = null;
+ _stalenessReaper?.Dispose();
+ _stalenessReaper = null;
+ _cts.Cancel();
+ _listenCycleCts?.Cancel();
+ _listenTask?.Wait(TimeSpan.FromSeconds(5));
+ }
+
+ public void Dispose()
+ {
+ Stop();
+ _cts.Dispose();
+ }
+
+ ///
+ /// Block until the coordinator is stopped.
+ ///
+ public void WaitForShutdown()
+ {
+ try
+ {
+ _listenTask?.Wait();
+ }
+ catch (AggregateException ex) when (ex.InnerException is OperationCanceledException)
+ {
+ }
+ }
+
+ private void ListenLoop(CancellationToken ct)
+ {
+ while (!ct.IsCancellationRequested)
+ {
+ string pipeName = GetPipeName();
+
+ // Create a per-cycle CTS linked to the main one.
+ // The pipe watchdog can cancel just this cycle to force socket recreation.
+ _listenCycleCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
+ var cycleToken = _listenCycleCts.Token;
+
+ // Don't use 'using' — ownership transfers to the threadpool handler.
+#pragma warning disable CA2000 // Dispose is called in the Task.Run finally block
+ var server = new NamedPipeServerStream(
+ pipeName,
+ PipeDirection.InOut,
+ NamedPipeServerStream.MaxAllowedServerInstances,
+ PipeTransmissionMode.Byte,
+ System.IO.Pipes.PipeOptions.CurrentUserOnly);
+#pragma warning restore CA2000
+
+ try
+ {
+ server.WaitForConnectionAsync(cycleToken).Wait(cycleToken);
+ }
+ catch (OperationCanceledException) when (!ct.IsCancellationRequested)
+ {
+ server.Dispose();
+ Console.WriteLine($"[{DateTime.Now:HH:mm:ss}] Pipe file missing, recreating listener...");
+ continue;
+ }
+ catch (OperationCanceledException)
+ {
+ server.Dispose();
+ break;
+ }
+ catch (AggregateException ex) when (ex.InnerException is OperationCanceledException && !ct.IsCancellationRequested)
+ {
+ server.Dispose();
+ Console.WriteLine($"[{DateTime.Now:HH:mm:ss}] Pipe file missing, recreating listener...");
+ continue;
+ }
+ catch (AggregateException ex) when (ex.InnerException is OperationCanceledException)
+ {
+ server.Dispose();
+ break;
+ }
+
+ // Handle on threadpool — immediately loop back to accept the next client.
+ // This minimizes the gap where no listener is bound.
+ var capturedServer = server;
+ Task.Run(() =>
+ {
+ try
+ {
+ HandleConnection(capturedServer);
+ }
+ finally
+ {
+ capturedServer.Dispose();
+ }
+ }, ct);
+ }
+ }
+
+ ///
+ /// Periodic watchdog that detects if the coordinator pipe file was deleted
+ /// (e.g. by a cleanup script) and interrupts the listen loop so it recreates it.
+ ///
+ private void CheckPipeHealth(object? state)
+ {
+ string pipeName = GetPipeName();
+ if (!File.Exists(pipeName))
+ {
+ Console.WriteLine($"[{DateTime.Now:HH:mm:ss}] WARNING: Pipe file {pipeName} was deleted externally! Triggering recreation...");
+ _listenCycleCts?.Cancel();
+ }
+ }
+
+ private void HandleConnection(NamedPipeServerStream server)
+ {
+ try
+ {
+ using var reader = new StreamReader(server, leaveOpen: true);
+ using var writer = new StreamWriter(server, leaveOpen: true) { AutoFlush = true };
+
+ string? line = reader.ReadLine();
+ if (string.IsNullOrEmpty(line))
+ {
+ return;
+ }
+
+ string[] parts = line.Split(' ');
+ string command = parts[0].ToUpperInvariant();
+
+ switch (command)
+ {
+ case "REGISTER":
+ HandleRegister(parts, writer);
+ break;
+ case "HEARTBEAT":
+ HandleHeartbeat(parts, writer);
+ break;
+ case "UNREGISTER":
+ HandleUnregister(parts, writer);
+ break;
+ case "STATUS":
+ HandleStatus(writer);
+ break;
+ case "SHUTDOWN":
+ writer.WriteLine("OK");
+ _cts.Cancel();
+ break;
+ default:
+ writer.WriteLine("ERR unknown command");
+ break;
+ }
+ }
+ catch (IOException)
+ {
+ // Client disconnected
+ }
+ }
+
+ private void HandleRegister(string[] parts, StreamWriter writer)
+ {
+ if (parts.Length < 3)
+ {
+ writer.WriteLine("ERR usage: REGISTER buildId requestedNodes");
+ return;
+ }
+
+ string buildId = parts[1];
+ if (!int.TryParse(parts[2], out int requested) || requested <= 0)
+ {
+ writer.WriteLine("ERR invalid requestedNodes");
+ return;
+ }
+
+ var registration = new BuildRegistration(buildId, requested, DateTime.UtcNow);
+
+ // First build ever — activate immediately with full budget
+ if (_activeBuilds.IsEmpty)
+ {
+ _activeBuilds[buildId] = registration;
+ registration.AcknowledgedEpoch = _rebalanceEpoch;
+ int granted = CalculateBudget(buildId);
+ registration.GrantedNodes = granted;
+
+ Console.WriteLine($"[{DateTime.Now:HH:mm:ss}] REGISTER {buildId}: requested={requested} granted={granted} (first build)");
+ writer.WriteLine($"OK {granted}");
+ return;
+ }
+
+ // Subsequent builds — always queue. Bump epoch so active builds must
+ // heartbeat (acknowledge reduced budget) before this build is promoted.
+ lock (_queueLock)
+ {
+ _queuedBuilds.Add(registration);
+ _rebalanceEpoch++;
+ int position = _queuedBuilds.Count;
+ int totalQueued = _queuedBuilds.Count;
+
+ Console.WriteLine($"[{DateTime.Now:HH:mm:ss}] QUEUED {buildId}: position={position}/{totalQueued} active={_activeBuilds.Count} epoch={_rebalanceEpoch} (waiting for heartbeat gate)");
+ writer.WriteLine($"QUEUED {position} {totalQueued}");
+ }
+ }
+
+ private void HandleHeartbeat(string[] parts, StreamWriter writer)
+ {
+ if (parts.Length < 2)
+ {
+ writer.WriteLine("ERR usage: HEARTBEAT buildId");
+ return;
+ }
+
+ string buildId = parts[1];
+
+ // Check if build is active
+ if (_activeBuilds.TryGetValue(buildId, out var activeReg))
+ {
+ activeReg.LastHeartbeat = DateTime.UtcNow;
+ activeReg.AcknowledgedEpoch = _rebalanceEpoch;
+
+ int newBudget = CalculateBudget(buildId);
+ activeReg.GrantedNodes = newBudget;
+ writer.WriteLine($"OK {newBudget}");
+
+ // After acknowledging, check if all active builds are caught up
+ // and we can promote queued builds
+ TryPromotePending();
+ return;
+ }
+
+ // Check if build is queued
+ lock (_queueLock)
+ {
+ int index = _queuedBuilds.FindIndex(b => b.BuildId == buildId);
+ if (index >= 0)
+ {
+ var queuedReg = _queuedBuilds[index];
+ queuedReg.LastHeartbeat = DateTime.UtcNow;
+ int position = index + 1;
+ int totalQueued = _queuedBuilds.Count;
+ int waitSec = (int)(DateTime.UtcNow - queuedReg.QueuedAt).TotalSeconds;
+ writer.WriteLine($"QUEUED {position} {totalQueued} {waitSec}");
+ return;
+ }
+ }
+
+ // Unknown build — return full budget (fallback)
+ writer.WriteLine($"OK {_totalBudget}");
+ }
+
+ private void HandleUnregister(string[] parts, StreamWriter writer)
+ {
+ if (parts.Length < 2)
+ {
+ writer.WriteLine("ERR usage: UNREGISTER buildId");
+ return;
+ }
+
+ string buildId = parts[1];
+
+ // Remove from active builds
+ bool wasActive = _activeBuilds.TryRemove(buildId, out _);
+
+ // Also remove from queue in case it was queued
+ if (!wasActive)
+ {
+ lock (_queueLock)
+ {
+ _queuedBuilds.RemoveAll(b => b.BuildId == buildId);
+ }
+ }
+
+ Console.WriteLine($"[{DateTime.Now:HH:mm:ss}] UNREGISTER {buildId}: active={_activeBuilds.Count} queued={_queuedBuilds.Count}");
+
+ // When a build leaves, remaining builds get MORE budget (safe direction).
+ // Promote immediately if there's a slot — the promoted build gets the correct
+ // share, and existing builds will learn their increased budget on next heartbeat.
+ string? promoted = null;
+ if (wasActive && _activeBuilds.Count < _maxConcurrentBuilds)
+ {
+ lock (_queueLock)
+ {
+ if (_queuedBuilds.Count > 0)
+ {
+ var next = _queuedBuilds[0];
+ _queuedBuilds.RemoveAt(0);
+ next.PromotedAt = DateTime.UtcNow;
+ next.AcknowledgedEpoch = _rebalanceEpoch;
+ _activeBuilds[next.BuildId] = next;
+ int granted = CalculateBudget(next.BuildId);
+ next.GrantedNodes = granted;
+ promoted = next.BuildId;
+
+ Console.WriteLine($"[{DateTime.Now:HH:mm:ss}] PROMOTED {next.BuildId}: granted={granted} waited={(next.PromotedAt.Value - next.QueuedAt):mm\\:ss} active={_activeBuilds.Count} queued={_queuedBuilds.Count}");
+
+ // If more queued, bump epoch for next round
+ if (_queuedBuilds.Count > 0)
+ {
+ _rebalanceEpoch++;
+ }
+ }
+ }
+ }
+
+ if (promoted != null)
+ {
+ writer.WriteLine($"OK promoted {promoted}");
+ }
+ else
+ {
+ writer.WriteLine("OK");
+ }
+
+ RebalanceAll();
+ }
+
+ ///
+ /// Promote queued builds if:
+ /// 1. There's capacity (active < max concurrent)
+ /// 2. All active builds have acknowledged the current rebalance epoch
+ /// (so they've received their reduced budget via heartbeat)
+ /// Promotes one build at a time, bumping epoch after each so the newly
+ /// promoted build must also heartbeat before the next one is promoted.
+ ///
+ private void TryPromotePending()
+ {
+ if (_activeBuilds.Count >= _maxConcurrentBuilds)
+ {
+ return;
+ }
+
+ // Check that ALL active builds have acknowledged the current epoch
+ int currentEpoch = _rebalanceEpoch;
+ foreach (var kvp in _activeBuilds)
+ {
+ if (kvp.Value.AcknowledgedEpoch < currentEpoch)
+ {
+ return; // Not all caught up yet
+ }
+ }
+
+ lock (_queueLock)
+ {
+ if (_queuedBuilds.Count == 0)
+ {
+ return;
+ }
+
+ // Promote one build
+ var next = _queuedBuilds[0];
+ _queuedBuilds.RemoveAt(0);
+
+ next.PromotedAt = DateTime.UtcNow;
+ next.AcknowledgedEpoch = currentEpoch; // It starts caught up
+ _activeBuilds[next.BuildId] = next;
+
+ int granted = CalculateBudget(next.BuildId);
+ next.GrantedNodes = granted;
+
+ Console.WriteLine($"[{DateTime.Now:HH:mm:ss}] PROMOTED {next.BuildId}: granted={granted} waited={(next.PromotedAt.Value - next.QueuedAt):mm\\:ss} active={_activeBuilds.Count} queued={_queuedBuilds.Count}");
+
+ // If more queued, bump epoch — existing active builds must heartbeat
+ // their new (further-reduced) budget before the next promotion
+ if (_queuedBuilds.Count > 0)
+ {
+ _rebalanceEpoch++;
+ Console.WriteLine($"[{DateTime.Now:HH:mm:ss}] Epoch bumped to {_rebalanceEpoch} — {_queuedBuilds.Count} still queued");
+ }
+ }
+
+ RebalanceAll();
+ }
+
+ private void HandleStatus(StreamWriter writer)
+ {
+ int queueCount;
+ lock (_queueLock) { queueCount = _queuedBuilds.Count; }
+
+ writer.WriteLine($"OK budget={_totalBudget} active={_activeBuilds.Count} queued={queueCount} max={_maxConcurrentBuilds} epoch={_rebalanceEpoch}");
+
+ if (!_activeBuilds.IsEmpty)
+ {
+ writer.WriteLine("Active:");
+ foreach (var kvp in _activeBuilds)
+ {
+ var reg = kvp.Value;
+ string ack = reg.AcknowledgedEpoch >= _rebalanceEpoch ? "yes" : "no";
+ writer.WriteLine($" {reg.BuildId}: granted={reg.GrantedNodes} requested={reg.RequestedNodes} epoch_ack={ack} age={DateTime.UtcNow - reg.RegisteredAt:mm\\:ss}");
+ }
+ }
+
+ lock (_queueLock)
+ {
+ if (_queuedBuilds.Count > 0)
+ {
+ writer.WriteLine("Queued:");
+ for (int i = 0; i < _queuedBuilds.Count; i++)
+ {
+ var reg = _queuedBuilds[i];
+ int waitSec = (int)(DateTime.UtcNow - reg.QueuedAt).TotalSeconds;
+ writer.WriteLine($" #{i + 1} {reg.BuildId}: requested={reg.RequestedNodes} waiting={waitSec}s");
+ }
+ }
+ }
+ }
+
+ private int CalculateBudget(string buildId)
+ {
+ int activeCount = _activeBuilds.Count;
+ if (activeCount == 0)
+ {
+ return _totalBudget;
+ }
+
+ // Account for queued builds that will be promoted soon.
+ // This way active builds pre-shrink to make room.
+ int pendingCount;
+ lock (_queueLock)
+ {
+ pendingCount = Math.Min(_queuedBuilds.Count, _maxConcurrentBuilds - activeCount);
+ pendingCount = Math.Max(0, pendingCount);
+ }
+
+ int totalBuilds = activeCount + pendingCount;
+ int fairShare = Math.Max(1, _totalBudget / totalBuilds);
+
+ // But don't exceed what the build originally requested
+ if (_activeBuilds.TryGetValue(buildId, out var registration))
+ {
+ return Math.Min(fairShare, registration.RequestedNodes);
+ }
+
+ return fairShare;
+ }
+
+ private void RebalanceAll()
+ {
+ foreach (var kvp in _activeBuilds)
+ {
+ kvp.Value.GrantedNodes = CalculateBudget(kvp.Key);
+ }
+
+ if (!_activeBuilds.IsEmpty)
+ {
+ var summary = string.Join(", ", _activeBuilds.Select(b => $"{b.Key}={b.Value.GrantedNodes}"));
+ Console.WriteLine($"[{DateTime.Now:HH:mm:ss}] Rebalanced: {summary}");
+ }
+ }
+
+ ///
+ /// Periodic timer callback that removes builds whose process has exited.
+ /// Only removes if heartbeat is stale AND the PID is no longer running.
+ ///
+ private void ReapStaleBuilds(object? state)
+ {
+ var now = DateTime.UtcNow;
+ bool anyReaped = false;
+
+ // Check active builds
+ foreach (var kvp in _activeBuilds)
+ {
+ var reg = kvp.Value;
+ double staleSec = (now - reg.LastHeartbeat).TotalSeconds;
+
+ if (staleSec < StaleHeartbeatSeconds)
+ {
+ continue; // Recent heartbeat, still healthy
+ }
+
+ // Heartbeat is stale — check if the process is actually dead
+ if (IsProcessAlive(kvp.Key))
+ {
+ continue; // Process still running, just slow to heartbeat
+ }
+
+ // Process is dead — reap it
+ if (_activeBuilds.TryRemove(kvp.Key, out _))
+ {
+ Console.WriteLine($"[{DateTime.Now:HH:mm:ss}] REAPED {kvp.Key}: process dead, stale {staleSec:F0}s");
+ anyReaped = true;
+ }
+ }
+
+ // Check queued builds too
+ lock (_queueLock)
+ {
+ for (int i = _queuedBuilds.Count - 1; i >= 0; i--)
+ {
+ var reg = _queuedBuilds[i];
+ double staleSec = (now - reg.LastHeartbeat).TotalSeconds;
+
+ if (staleSec < StaleHeartbeatSeconds)
+ {
+ continue;
+ }
+
+ if (IsProcessAlive(reg.BuildId))
+ {
+ continue;
+ }
+
+ Console.WriteLine($"[{DateTime.Now:HH:mm:ss}] REAPED (queued) {reg.BuildId}: process dead, stale {staleSec:F0}s");
+ _queuedBuilds.RemoveAt(i);
+ anyReaped = true;
+ }
+ }
+
+ if (anyReaped)
+ {
+ // Try to promote queued builds into newly opened slots
+ TryPromotePending();
+ RebalanceAll();
+ }
+ }
+
+ ///
+ /// Extract PID from build ID (format: "{PID}-{ticks}") and check if the process is alive.
+ ///
+ private static bool IsProcessAlive(string buildId)
+ {
+ int dashIndex = buildId.IndexOf('-');
+ if (dashIndex <= 0)
+ {
+ return false; // Can't parse — assume dead
+ }
+
+ if (!int.TryParse(buildId.AsSpan(0, dashIndex), out int pid))
+ {
+ return false;
+ }
+
+ try
+ {
+ var process = Process.GetProcessById(pid);
+ return !process.HasExited;
+ }
+ catch (ArgumentException)
+ {
+ // Process doesn't exist
+ return false;
+ }
+ catch (InvalidOperationException)
+ {
+ return false;
+ }
+ }
+
+ private sealed class BuildRegistration
+ {
+ internal string BuildId { get; }
+ internal int RequestedNodes { get; }
+ internal int GrantedNodes { get; set; }
+ internal DateTime RegisteredAt { get; }
+ internal DateTime QueuedAt { get; }
+ internal DateTime? PromotedAt { get; set; }
+ internal DateTime LastHeartbeat { get; set; }
+ internal int AcknowledgedEpoch { get; set; }
+
+ internal BuildRegistration(string buildId, int requestedNodes, DateTime registeredAt)
+ {
+ BuildId = buildId;
+ RequestedNodes = requestedNodes;
+ GrantedNodes = requestedNodes;
+ RegisteredAt = registeredAt;
+ QueuedAt = registeredAt;
+ LastHeartbeat = registeredAt;
+ AcknowledgedEpoch = -1; // Not yet acknowledged
+ }
+ }
+ }
+}
diff --git a/src/Build/BackEnd/BuildCoordinator/BuildCoordinatorClient.cs b/src/Build/BackEnd/BuildCoordinator/BuildCoordinatorClient.cs
new file mode 100644
index 00000000000..aad7333d6c1
--- /dev/null
+++ b/src/Build/BackEnd/BuildCoordinator/BuildCoordinatorClient.cs
@@ -0,0 +1,217 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System;
+using System.IO;
+using System.IO.Pipes;
+using System.Threading;
+
+namespace Microsoft.Build.BackEnd
+{
+ ///
+ /// Client used by BuildManager to communicate with an external BuildCoordinator process.
+ /// If no coordinator is running, all operations gracefully no-op and the build runs with
+ /// its original MaxNodeCount.
+ ///
+ /// When the coordinator is at capacity, TryRegister will block (heartbeating for position
+ /// updates) until the build is promoted to active.
+ ///
+ internal sealed class BuildCoordinatorClient : IDisposable
+ {
+ private readonly string _buildId;
+ private readonly string _pipeName;
+ private int _grantedNodes;
+ private Timer? _heartbeatTimer;
+ private bool _registered;
+ private Action? _onBudgetChanged;
+
+ internal bool IsConnected => _registered;
+ internal int GrantedNodes => _grantedNodes;
+ internal string BuildId => _buildId;
+
+ internal BuildCoordinatorClient()
+ {
+ _buildId = $"{Environment.ProcessId}-{DateTime.UtcNow.Ticks}";
+ _pipeName = BuildCoordinator.GetPipeName();
+ }
+
+ ///
+ /// Try to register with the coordinator. Returns true if a coordinator was found
+ /// and the build was registered (or promoted from queue).
+ ///
+ /// If the coordinator has capacity, returns immediately with grantedNodes.
+ /// If queued, blocks and heartbeats until promoted, calling onQueuePositionChanged
+ /// with (position, totalQueued, waitSeconds) on each update.
+ ///
+ internal bool TryRegister(int requestedNodes, out int grantedNodes, Action? onQueuePositionChanged = null, CancellationToken ct = default)
+ {
+ grantedNodes = requestedNodes;
+
+ string? response = SendCommand($"REGISTER {_buildId} {requestedNodes}");
+ if (response == null)
+ {
+ return false;
+ }
+
+ // Immediate grant
+ if (response.StartsWith("OK ", StringComparison.Ordinal))
+ {
+ if (int.TryParse(response.AsSpan(3), out int granted) && granted > 0)
+ {
+ _grantedNodes = granted;
+ grantedNodes = granted;
+ _registered = true;
+ return true;
+ }
+
+ return false;
+ }
+
+ // Queued — block and heartbeat until promoted
+ if (response.StartsWith("QUEUED ", StringComparison.Ordinal))
+ {
+ return WaitInQueue(requestedNodes, out grantedNodes, onQueuePositionChanged, ct);
+ }
+
+ return false;
+ }
+
+ ///
+ /// Block heartbeating until the coordinator promotes this build to active.
+ ///
+ private bool WaitInQueue(int requestedNodes, out int grantedNodes, Action? onQueuePositionChanged, CancellationToken ct)
+ {
+ grantedNodes = requestedNodes;
+
+ while (!ct.IsCancellationRequested)
+ {
+ Thread.Sleep(2000); // Heartbeat interval
+
+ string? hbResponse = SendCommand($"HEARTBEAT {_buildId}");
+ if (hbResponse == null)
+ {
+ // Coordinator gone — fall through with original node count
+ return false;
+ }
+
+ // Promoted to active!
+ if (hbResponse.StartsWith("OK ", StringComparison.Ordinal))
+ {
+ if (int.TryParse(hbResponse.AsSpan(3), out int granted) && granted > 0)
+ {
+ _grantedNodes = granted;
+ grantedNodes = granted;
+ _registered = true;
+ return true;
+ }
+
+ return false;
+ }
+
+ // Still queued — parse position info: "QUEUED position totalQueued waitSec"
+ if (hbResponse.StartsWith("QUEUED ", StringComparison.Ordinal))
+ {
+ string[] parts = hbResponse.Split(' ');
+ if (parts.Length >= 4
+ && int.TryParse(parts[1], out int position)
+ && int.TryParse(parts[2], out int totalQueued)
+ && int.TryParse(parts[3], out int waitSec))
+ {
+ onQueuePositionChanged?.Invoke(position, totalQueued, waitSec);
+ }
+ }
+ }
+
+ // Cancelled — unregister
+ SendCommand($"UNREGISTER {_buildId}");
+ return false;
+ }
+
+ ///
+ /// Start periodic heartbeats that update the node budget.
+ /// When the coordinator rebalances (e.g., another build starts or stops),
+ /// the callback fires with the new budget.
+ ///
+ internal void StartHeartbeat(Action onBudgetChanged)
+ {
+ _onBudgetChanged = onBudgetChanged;
+ _heartbeatTimer = new Timer(HeartbeatCallback, null, TimeSpan.FromSeconds(2), TimeSpan.FromSeconds(2));
+ }
+
+ ///
+ /// Unregister from the coordinator and stop heartbeats.
+ ///
+ internal void Unregister()
+ {
+ _heartbeatTimer?.Dispose();
+ _heartbeatTimer = null;
+
+ if (_registered)
+ {
+ SendCommand($"UNREGISTER {_buildId}");
+ _registered = false;
+ }
+ }
+
+ public void Dispose()
+ {
+ Unregister();
+ }
+
+ private void HeartbeatCallback(object? state)
+ {
+ if (!_registered)
+ {
+ return;
+ }
+
+ string? response = SendCommand($"HEARTBEAT {_buildId}");
+ if (response != null && response.StartsWith("OK ", StringComparison.Ordinal))
+ {
+ if (int.TryParse(response.AsSpan(3), out int newBudget) && newBudget > 0 && newBudget != _grantedNodes)
+ {
+ _grantedNodes = newBudget;
+ _onBudgetChanged?.Invoke(newBudget);
+ }
+ }
+ }
+
+ ///
+ /// Send a command to the coordinator and return the response line, or null if connection fails.
+ /// Retries up to 3 times with 300ms delay to handle the brief gap between listener re-binds.
+ ///
+ private string? SendCommand(string command)
+ {
+ for (int attempt = 0; attempt < 3; attempt++)
+ {
+ try
+ {
+ using var client = new NamedPipeClientStream(".", _pipeName, PipeDirection.InOut, System.IO.Pipes.PipeOptions.CurrentUserOnly);
+ client.Connect(1000);
+
+ using var writer = new StreamWriter(client, leaveOpen: true) { AutoFlush = true };
+ using var reader = new StreamReader(client, leaveOpen: true);
+
+ writer.WriteLine(command);
+ return reader.ReadLine();
+ }
+ catch (TimeoutException)
+ {
+ if (attempt < 2)
+ {
+ Thread.Sleep(300);
+ }
+ }
+ catch (IOException)
+ {
+ if (attempt < 2)
+ {
+ Thread.Sleep(300);
+ }
+ }
+ }
+
+ return null;
+ }
+ }
+}
diff --git a/src/Build/BackEnd/BuildManager/BuildManager.cs b/src/Build/BackEnd/BuildManager/BuildManager.cs
index 9656669047a..f1cde74e437 100644
--- a/src/Build/BackEnd/BuildManager/BuildManager.cs
+++ b/src/Build/BackEnd/BuildManager/BuildManager.cs
@@ -267,6 +267,14 @@ public class BuildManager : INodePacketHandler, IBuildComponentHost, IDisposable
private bool _hasProjectCacheServiceInitializedVsScenario;
+#if NET
+ ///
+ /// Client for communicating with an external build coordinator process.
+ /// If a coordinator is running, it dynamically adjusts MaxNodeCount.
+ ///
+ private BuildCoordinatorClient? _coordinatorClient;
+#endif
+
#if DEBUG
///
/// true to wait for a debugger to be attached, otherwise false.
@@ -687,6 +695,12 @@ public void BeginBuild(BuildParameters parameters)
_noNodesActiveEvent!.Set();
}
+ // Try to register with external build coordinator (if running).
+ // This may block if the coordinator queues this build.
+#if NET
+ TryRegisterWithCoordinator();
+#endif
+
ILoggingService InitializeLoggingService()
{
ILoggingService loggingService = CreateLoggingService(
@@ -1026,6 +1040,12 @@ public void EndBuild()
var exceptionsThrownInEndBuild = false;
+ // Unregister from coordinator early so other builds can scale up
+#if NET
+ _coordinatorClient?.Dispose();
+ _coordinatorClient = null;
+#endif
+
try
{
lock (_syncLock)
@@ -2270,7 +2290,16 @@ private void ShutdownConnectedNodes(bool abort)
_executionCancellationTokenSource?.Cancel();
// If we are aborting, we will NOT reuse the nodes because their state may be compromised by attempts to shut down while the build is in-progress.
- _nodeManager?.ShutdownConnectedNodes(!abort && _buildParameters!.EnableNodeReuse);
+ // When a coordinator is managing builds, disable reuse so nodes exit immediately
+ // instead of lingering for 15 minutes — the coordinator handles cross-build lifecycle.
+#if NET
+ bool coordinatorActive = _coordinatorClient != null || (Environment.GetEnvironmentVariable("MSBUILD_COORDINATOR_DISABLE") != "1"
+ && System.IO.File.Exists(BuildCoordinator.GetPipeName()));
+ bool enableReuse = !abort && _buildParameters!.EnableNodeReuse && !coordinatorActive;
+#else
+ bool enableReuse = !abort && _buildParameters!.EnableNodeReuse;
+#endif
+ _nodeManager?.ShutdownConnectedNodes(enableReuse);
// if we are aborting, the task host will hear about it in time through the task building infrastructure;
// so only shut down the task host nodes if we're shutting down tidily (in which case, it is assumed that all
@@ -2323,6 +2352,74 @@ private void VerifyStateInternal(BuildManagerState requiredState)
///
/// Method called to reset the state of the system after a build.
///
+ ///
+ /// Attempts to register this build with an external build coordinator process.
+ /// If a coordinator is running, it may adjust MaxNodeCount or queue the build.
+ /// If no coordinator is running, does nothing and the build proceeds normally.
+ ///
+#if NET
+ private void TryRegisterWithCoordinator()
+ {
+ if (_buildParameters == null)
+ {
+ return;
+ }
+
+ // Allow opt-out via environment variable
+ if (Environment.GetEnvironmentVariable("MSBUILD_COORDINATOR_DISABLE") == "1")
+ {
+ return;
+ }
+
+ var client = new BuildCoordinatorClient();
+ int requestedNodes = _buildParameters.MaxNodeCount;
+
+ bool registered = client.TryRegister(
+ requestedNodes,
+ out int grantedNodes,
+ onQueuePositionChanged: (position, total, waitSec) =>
+ {
+ // Log queue position updates to console
+ Console.Error.WriteLine($" [coordinator] Queued: position {position}/{total}, waiting {waitSec}s");
+ },
+ ct: _executionCancellationTokenSource?.Token ?? CancellationToken.None);
+
+ if (registered)
+ {
+ _coordinatorClient = client;
+
+ if (grantedNodes != requestedNodes)
+ {
+ _buildParameters.MaxNodeCount = grantedNodes;
+ Console.Error.WriteLine($" [coordinator] Node budget: {grantedNodes} (requested {requestedNodes})");
+ }
+
+ // Start heartbeats — coordinator may adjust budget dynamically
+ client.StartHeartbeat(newBudget =>
+ {
+ if (_buildParameters != null && newBudget != _buildParameters.MaxNodeCount)
+ {
+ int old = _buildParameters.MaxNodeCount;
+ _buildParameters.MaxNodeCount = newBudget;
+
+ // If budget decreased, shut down excess worker nodes immediately
+ if (newBudget < old)
+ {
+ _nodeManager?.ShutdownExcessNodes(newBudget);
+ }
+
+ Console.Error.WriteLine($" [coordinator] Budget changed: {old} → {newBudget}");
+ }
+ });
+ }
+ else
+ {
+ // No coordinator found — dispose client and run normally
+ client.Dispose();
+ }
+ }
+#endif
+
private void Reset()
{
_nodeManager?.UnregisterPacketHandler(NodePacketType.BuildRequestBlocker);
diff --git a/src/Build/BackEnd/Components/Communications/INodeManager.cs b/src/Build/BackEnd/Components/Communications/INodeManager.cs
index f5b79fa0ba7..b951c240106 100644
--- a/src/Build/BackEnd/Components/Communications/INodeManager.cs
+++ b/src/Build/BackEnd/Components/Communications/INodeManager.cs
@@ -54,6 +54,11 @@ internal interface INodeManager : IBuildComponent,
void ClearPerBuildState();
IEnumerable GetProcesses();
+
+ ///
+ /// Shuts down out-of-proc nodes beyond the specified limit.
+ ///
+ void ShutdownExcessNodes(int maxNodesToKeep);
#endregion
}
}
diff --git a/src/Build/BackEnd/Components/Communications/NodeEndpointOutOfProc.cs b/src/Build/BackEnd/Components/Communications/NodeEndpointOutOfProc.cs
index 437d741d41b..da46d0fb26f 100644
--- a/src/Build/BackEnd/Components/Communications/NodeEndpointOutOfProc.cs
+++ b/src/Build/BackEnd/Components/Communications/NodeEndpointOutOfProc.cs
@@ -26,7 +26,13 @@ internal NodeEndpointOutOfProc(bool enableReuse, bool lowPriority)
_enableReuse = enableReuse;
LowPriority = lowPriority;
- InternalConstruct();
+ // Use hash-based pipe name for fast discovery on Unix.
+ // Format: MSBuild-{hash}-{pid} — allows schedulers to find compatible nodes
+ // by listing /tmp/MSBuild-{hash}-* instead of probing all dotnet processes.
+ string? pipeName = NativeMethodsShared.IsUnixLike
+ ? NamedPipeUtil.GetHashBasedPipeName(GetHandshake().ComputeHash())
+ : null; // Windows: keep legacy MSBuild{PID} naming
+ InternalConstruct(pipeName);
}
///
diff --git a/src/Build/BackEnd/Components/Communications/NodeManager.cs b/src/Build/BackEnd/Components/Communications/NodeManager.cs
index fb83ceaf5c1..1193a895794 100644
--- a/src/Build/BackEnd/Components/Communications/NodeManager.cs
+++ b/src/Build/BackEnd/Components/Communications/NodeManager.cs
@@ -361,5 +361,13 @@ public IEnumerable GetProcesses()
{
return _outOfProcNodeProvider?.GetProcesses()!;
}
+
+ public void ShutdownExcessNodes(int maxNodesToKeep)
+ {
+ if (_outOfProcNodeProvider is NodeProviderOutOfProc provider)
+ {
+ provider.ShutdownExcessNodes(maxNodesToKeep);
+ }
+ }
}
}
diff --git a/src/Build/BackEnd/Components/Communications/NodeProviderOutOfProc.cs b/src/Build/BackEnd/Components/Communications/NodeProviderOutOfProc.cs
index 02e998cfa1e..7cb32d770d2 100644
--- a/src/Build/BackEnd/Components/Communications/NodeProviderOutOfProc.cs
+++ b/src/Build/BackEnd/Components/Communications/NodeProviderOutOfProc.cs
@@ -152,6 +152,26 @@ public void ShutdownConnectedNodes(bool enableReuse)
ShutdownConnectedNodes(contextsToShutDown, enableReuse);
}
+ ///
+ /// Shuts down connected nodes beyond the specified limit, keeping only nodes.
+ /// Excess nodes are sent NodeBuildComplete with enableReuse=true so they prepare for reuse by a future build.
+ ///
+ public void ShutdownExcessNodes(int maxNodesToKeep)
+ {
+ if (_nodeContexts.Count <= maxNodesToKeep)
+ {
+ return;
+ }
+
+ int toShutdown = _nodeContexts.Count - maxNodesToKeep;
+ var excessContexts = _nodeContexts.Values.Take(toShutdown).ToList();
+
+ foreach (var context in excessContexts)
+ {
+ context.SendData(new NodeBuildComplete(true /* prepare for reuse */));
+ }
+ }
+
///
/// Shuts down all of the managed nodes permanently.
///
diff --git a/src/Build/BackEnd/Components/Communications/NodeProviderOutOfProcBase.cs b/src/Build/BackEnd/Components/Communications/NodeProviderOutOfProcBase.cs
index 9f701208dde..25349a97a4a 100644
--- a/src/Build/BackEnd/Components/Communications/NodeProviderOutOfProcBase.cs
+++ b/src/Build/BackEnd/Components/Communications/NodeProviderOutOfProcBase.cs
@@ -49,6 +49,12 @@ internal abstract partial class NodeProviderOutOfProcBase
///
private const int TimeoutForNewNodeCreation = 30000;
+ ///
+ /// The amount of time to wait when attempting to reuse an existing idle node.
+ /// Must be long enough for a sleeping node to wake and respond to the handshake.
+ ///
+ private const int TimeoutForNodeReuse = 1000;
+
///
/// The amount of time to wait for an out-of-proc node to exit.
///
@@ -255,7 +261,34 @@ protected IList GetNodes(
if (nodeReuseRequested)
{
IList possibleRunningNodesList;
- (expectedProcessName, possibleRunningNodesList) = GetPossibleRunningNodes(msbuildLocation, expectedNodeMode);
+
+ // On Unix, use hash-based pipe file listing for O(1) discovery of compatible nodes
+ // instead of enumerating all dotnet processes and probing each one.
+ if (NativeMethodsShared.IsUnixLike)
+ {
+ string handshakeHash = nodeLaunchData.Handshake.ComputeHash();
+ IList pids = NamedPipeUtil.FindNodesByHandshakeHash(handshakeHash);
+ var processes = new List(pids.Count);
+ foreach (int pid in pids)
+ {
+ try
+ {
+ processes.Add(Process.GetProcessById(pid));
+ }
+ catch
+ {
+ // Process may have exited between pipe file listing and this call.
+ }
+ }
+
+ expectedProcessName = "dotnet";
+ possibleRunningNodesList = processes;
+ }
+ else
+ {
+ (expectedProcessName, possibleRunningNodesList) = GetPossibleRunningNodes(msbuildLocation, expectedNodeMode);
+ }
+
possibleRunningNodes = new ConcurrentQueue(possibleRunningNodesList);
if (possibleRunningNodesList.Count > 0)
@@ -318,7 +351,7 @@ bool TryReuseAnyFromPossibleRunningNodes(int currentProcessId, int nodeId)
_processesToIgnore.TryAdd(nodeLookupKey, default);
// Attempt to connect to each process in turn.
- Stream nodeStream = TryConnectToProcess(nodeToReuse.Id, 0 /* poll, don't wait for connections */, nodeLaunchData.Handshake, out HandshakeResult result);
+ Stream nodeStream = TryConnectToProcess(nodeToReuse.Id, TimeoutForNodeReuse, nodeLaunchData.Handshake, out HandshakeResult result);
if (nodeStream != null)
{
// Connection successful, use this node.
@@ -719,8 +752,11 @@ private static void ValidateRemotePipeSecurityOnWindows(NamedPipeClientStream no
///
private Stream TryConnectToProcess(int nodeProcessId, int timeout, Handshake handshake, out HandshakeResult result)
{
- // Try and connect to the process.
- string pipeName = NamedPipeUtil.GetPlatformSpecificPipeName(nodeProcessId);
+ // On Unix, nodes create pipes with hash-based names for fast discovery.
+ // On Windows, keep legacy MSBuild{PID} naming.
+ string pipeName = NativeMethodsShared.IsUnixLike
+ ? NamedPipeUtil.GetHashBasedPipeName(handshake.ComputeHash(), nodeProcessId)
+ : NamedPipeUtil.GetPlatformSpecificPipeName(nodeProcessId);
#pragma warning disable SA1111, SA1009 // Closing parenthesis should be on line of last parameter
NamedPipeClientStream nodeStream = new NamedPipeClientStream(
diff --git a/src/Build/BackEnd/Components/Communications/TaskHostNodeManager.cs b/src/Build/BackEnd/Components/Communications/TaskHostNodeManager.cs
index 66c881052b8..5aec9f9ca4a 100644
--- a/src/Build/BackEnd/Components/Communications/TaskHostNodeManager.cs
+++ b/src/Build/BackEnd/Components/Communications/TaskHostNodeManager.cs
@@ -184,5 +184,10 @@ IEnumerable INodeManager.GetProcesses()
{
return _outOfProcTaskHostNodeProvider.GetProcesses();
}
+
+ void INodeManager.ShutdownExcessNodes(int maxNodesToKeep)
+ {
+ // Task host nodes are short-lived; no excess to trim.
+ }
}
}
diff --git a/src/Build/Microsoft.Build.csproj b/src/Build/Microsoft.Build.csproj
index 5343eabdc97..9a9ff2f8d26 100644
--- a/src/Build/Microsoft.Build.csproj
+++ b/src/Build/Microsoft.Build.csproj
@@ -169,6 +169,8 @@
+
+
diff --git a/src/MSBuild/XMake.cs b/src/MSBuild/XMake.cs
index 089e93d533a..029d343a8af 100644
--- a/src/MSBuild/XMake.cs
+++ b/src/MSBuild/XMake.cs
@@ -310,7 +310,17 @@ public static int Main(string[] args)
}
int exitCode;
+
+ // Check for coordinator mode: `dotnet msbuild --coordinator [--budget N] [--max-builds N]`
+#if NET
+ if (IsCoordinatorMode(args, out int budget, out int maxBuilds))
+ {
+ exitCode = RunCoordinator(budget, maxBuilds);
+ }
+ else if (
+#else
if (
+#endif
Environment.GetEnvironmentVariable(Traits.UseMSBuildServerEnvVarName) == "1" &&
!Traits.Instance.EscapeHatches.EnsureStdOutForChildNodesIsPrimaryStdout &&
CanRunServerBasedOnCommandLineSwitches(args))
@@ -337,6 +347,63 @@ public static int Main(string[] args)
return exitCode;
}
+ ///
+ /// Check if the command line requests coordinator mode.
+ /// Usage: msbuild --coordinator [--budget N] [--max-builds N]
+ ///
+#if NET
+ private static bool IsCoordinatorMode(string[] args, out int budget, out int maxBuilds)
+ {
+ budget = 0;
+ maxBuilds = 2;
+
+ bool found = false;
+ for (int i = 0; i < args.Length; i++)
+ {
+ if (args[i].Equals("--coordinator", StringComparison.OrdinalIgnoreCase))
+ {
+ found = true;
+ }
+ else if (args[i].Equals("--budget", StringComparison.OrdinalIgnoreCase) && i + 1 < args.Length)
+ {
+ int.TryParse(args[i + 1], out budget);
+ i++;
+ }
+ else if (args[i].Equals("--max-builds", StringComparison.OrdinalIgnoreCase) && i + 1 < args.Length)
+ {
+ int.TryParse(args[i + 1], out maxBuilds);
+ i++;
+ }
+ }
+
+ if (found && budget <= 0)
+ {
+ // Default budget: 80% of logical processors
+ budget = Math.Max(1, (int)(Environment.ProcessorCount * 0.8));
+ }
+
+ return found;
+ }
+
+ ///
+ /// Run the build coordinator as a long-lived process.
+ ///
+ private static int RunCoordinator(int budget, int maxBuilds)
+ {
+ using var coordinator = new Microsoft.Build.BackEnd.BuildCoordinator(budget, maxBuilds);
+ coordinator.Start();
+
+ Console.CancelKeyPress += (_, e) =>
+ {
+ e.Cancel = true;
+ coordinator.Stop();
+ };
+
+ coordinator.WaitForShutdown();
+ return 0;
+ }
+#endif
+
///
/// Returns true if arguments allows or make sense to leverage msbuild server.
///
diff --git a/src/Shared/CommunicationsUtilities.cs b/src/Shared/CommunicationsUtilities.cs
index 8c4f6a50167..96e3191ec3f 100644
--- a/src/Shared/CommunicationsUtilities.cs
+++ b/src/Shared/CommunicationsUtilities.cs
@@ -289,8 +289,14 @@ protected Handshake(HandshakeOptions nodeType, bool includeSessionId, string too
int sessionId = 0;
if (includeSessionId)
{
- using var currentProcess = Process.GetCurrentProcess();
- sessionId = currentProcess.SessionId;
+ if (NativeMethodsShared.IsWindows)
+ {
+ using var currentProcess = Process.GetCurrentProcess();
+ sessionId = currentProcess.SessionId;
+ }
+ // On Unix, getsid() returns the session leader PID which differs per terminal,
+ // preventing cross-terminal node reuse. Use 0 since Unix doesn't need
+ // RDP-style session isolation.
}
_handshakeComponents = IsNetTaskHost
@@ -340,35 +346,15 @@ private static HandshakeComponents CreateStandardComponents(int options, int sal
public virtual string GetKey() => $"{_handshakeComponents.Options} {_handshakeComponents.Salt} {_handshakeComponents.FileVersionMajor} {_handshakeComponents.FileVersionMinor} {_handshakeComponents.FileVersionBuild} {_handshakeComponents.FileVersionPrivate} {_handshakeComponents.SessionId}".ToString(CultureInfo.InvariantCulture);
public virtual byte? ExpectedVersionInFirstByte => CommunicationsUtilities.handshakeVersion;
- }
- internal sealed class ServerNodeHandshake : Handshake
- {
///
/// Caching computed hash.
///
private string _computedHash = null;
- public override byte? ExpectedVersionInFirstByte => null;
-
- internal ServerNodeHandshake(HandshakeOptions nodeType)
- : base(nodeType, includeSessionId: false, toolsDirectory: null)
- {
- }
-
- public override HandshakeComponents RetrieveHandshakeComponents() => new HandshakeComponents(
- CommunicationsUtilities.AvoidEndOfHandshakeSignal(_handshakeComponents.Options),
- CommunicationsUtilities.AvoidEndOfHandshakeSignal(_handshakeComponents.Salt),
- CommunicationsUtilities.AvoidEndOfHandshakeSignal(_handshakeComponents.FileVersionMajor),
- CommunicationsUtilities.AvoidEndOfHandshakeSignal(_handshakeComponents.FileVersionMinor),
- CommunicationsUtilities.AvoidEndOfHandshakeSignal(_handshakeComponents.FileVersionBuild),
- CommunicationsUtilities.AvoidEndOfHandshakeSignal(_handshakeComponents.FileVersionPrivate));
-
- public override string GetKey() => $"{_handshakeComponents.Options} {_handshakeComponents.Salt} {_handshakeComponents.FileVersionMajor} {_handshakeComponents.FileVersionMinor} {_handshakeComponents.FileVersionBuild} {_handshakeComponents.FileVersionPrivate}"
- .ToString(CultureInfo.InvariantCulture);
-
///
/// Computes Handshake stable hash string representing whole state of handshake.
+ /// Used for hash-based pipe naming to enable fast node discovery without trial-and-error probing.
///
public string ComputeHash()
{
@@ -391,6 +377,27 @@ public string ComputeHash()
}
}
+ internal sealed class ServerNodeHandshake : Handshake
+ {
+ public override byte? ExpectedVersionInFirstByte => null;
+
+ internal ServerNodeHandshake(HandshakeOptions nodeType)
+ : base(nodeType, includeSessionId: false, toolsDirectory: null)
+ {
+ }
+
+ public override HandshakeComponents RetrieveHandshakeComponents() => new HandshakeComponents(
+ CommunicationsUtilities.AvoidEndOfHandshakeSignal(_handshakeComponents.Options),
+ CommunicationsUtilities.AvoidEndOfHandshakeSignal(_handshakeComponents.Salt),
+ CommunicationsUtilities.AvoidEndOfHandshakeSignal(_handshakeComponents.FileVersionMajor),
+ CommunicationsUtilities.AvoidEndOfHandshakeSignal(_handshakeComponents.FileVersionMinor),
+ CommunicationsUtilities.AvoidEndOfHandshakeSignal(_handshakeComponents.FileVersionBuild),
+ CommunicationsUtilities.AvoidEndOfHandshakeSignal(_handshakeComponents.FileVersionPrivate));
+
+ public override string GetKey() => $"{_handshakeComponents.Options} {_handshakeComponents.Salt} {_handshakeComponents.FileVersionMajor} {_handshakeComponents.FileVersionMinor} {_handshakeComponents.FileVersionBuild} {_handshakeComponents.FileVersionPrivate}"
+ .ToString(CultureInfo.InvariantCulture);
+ }
+
///
/// This class contains utility methods for the MSBuild engine.
///
@@ -409,7 +416,7 @@ internal static class CommunicationsUtilities
///
/// The timeout to connect to a node.
///
- private const int DefaultNodeConnectionTimeout = 900 * 1000; // 15 minutes; enough time that a dev will typically do another build in this time
+ private const int DefaultNodeConnectionTimeout = 30 * 1000; // 30 seconds
///
/// Whether to trace communications
diff --git a/src/Shared/NamedPipeUtil.cs b/src/Shared/NamedPipeUtil.cs
index 0b85b05bacd..b0d43e70ee4 100644
--- a/src/Shared/NamedPipeUtil.cs
+++ b/src/Shared/NamedPipeUtil.cs
@@ -1,6 +1,7 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
+using System.Collections.Generic;
using System.IO;
using Microsoft.Build.Internal;
@@ -20,6 +21,56 @@ internal static string GetPlatformSpecificPipeName(int? processId = null)
return GetPlatformSpecificPipeName(pipeName);
}
+ ///
+ /// Returns a pipe name that encodes both the handshake hash and the process ID.
+ /// Format: MSBuild-{hash}-{pid}
+ /// This allows discovery of compatible nodes by listing pipes matching the hash prefix,
+ /// eliminating trial-and-error probing of all dotnet processes.
+ ///
+ internal static string GetHashBasedPipeName(string handshakeHash, int? processId = null)
+ {
+ processId ??= EnvironmentUtilities.CurrentProcessId;
+ string pipeName = $"MSBuild-{handshakeHash}-{processId}";
+ return GetPlatformSpecificPipeName(pipeName);
+ }
+
+ ///
+ /// Finds pipe files matching a handshake hash and extracts their PIDs.
+ /// Only works on Unix where pipes are files in /tmp.
+ ///
+ internal static IList FindNodesByHandshakeHash(string handshakeHash)
+ {
+ var pids = new List();
+ // GetPlatformSpecificPipeName returns full paths like /tmp/MSBuild-{hash}-{pid}
+ // on Unix, and .NET does NOT add CoreFxPipe_ prefix for absolute paths.
+ string prefix = $"MSBuild-{handshakeHash}-";
+ string? pipeDir = NativeMethodsShared.IsUnixLike ? "/tmp" : null;
+
+ if (pipeDir == null)
+ {
+ // On Windows, named pipes aren't files — fall back to legacy discovery.
+ return pids;
+ }
+
+ try
+ {
+ foreach (string file in System.IO.Directory.GetFiles(pipeDir, $"MSBuild-{handshakeHash}-*"))
+ {
+ string fileName = Path.GetFileName(file);
+ if (fileName.StartsWith(prefix) && int.TryParse(fileName.Substring(prefix.Length), out int pid))
+ {
+ pids.Add(pid);
+ }
+ }
+ }
+ catch
+ {
+ // Directory enumeration can fail; fall back to legacy.
+ }
+
+ return pids;
+ }
+
internal static string GetPlatformSpecificPipeName(string pipeName)
{
if (NativeMethodsShared.IsUnixLike)
diff --git a/src/Shared/NodeEndpointOutOfProcBase.cs b/src/Shared/NodeEndpointOutOfProcBase.cs
index ae422e25e31..345b7317895 100644
--- a/src/Shared/NodeEndpointOutOfProcBase.cs
+++ b/src/Shared/NodeEndpointOutOfProcBase.cs
@@ -44,8 +44,10 @@ internal abstract class NodeEndpointOutOfProcBase : INodeEndpoint
#if NETCOREAPP2_1_OR_GREATER
///
/// The amount of time to wait for the client to connect to the host.
+ /// Reduced from 60s to 5s so that failed reuse probes don't block idle nodes
+ /// from reaching their connection timeout check.
///
- private const int ClientConnectTimeout = 60000;
+ private const int ClientConnectTimeout = 5000;
#endif // NETCOREAPP2_1
///