From 89600008b9ff0477f4fdcecd769679e4d0203331 Mon Sep 17 00:00:00 2001 From: Rolf Bjarne Kvinge Date: Mon, 27 Apr 2026 16:01:21 +0200 Subject: [PATCH 1/4] [tests] Fix flaky DeskCase_83099_InmutableDictionary keychain test Three changes to eliminate intermittent InvalidRecord failures: 1. Use a process-unique server name (Test1-{PID}) to avoid cross-process keychain conflicts on shared CI agents. 2. Stop attaching LAContext with InteractionNotAllowed to search/delete records. The LAContext can cause intermittent InvalidRecord errors on some macOS keychain states. Plain internet passwords don't require biometric auth, so the LAContext is unnecessary for these operations. 3. Handle unexpected query status codes (e.g. InvalidRecord) by falling through to the add path instead of silently returning false. The previous code only handled ItemNotFound and Success, leaving any other query result as an unrecoverable failure. Fixes https://github.com/dotnet/macios/issues/25222 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/monotouch-test/Security/RecordTest.cs | 117 ++++++++++---------- 1 file changed, 58 insertions(+), 59 deletions(-) diff --git a/tests/monotouch-test/Security/RecordTest.cs b/tests/monotouch-test/Security/RecordTest.cs index 73eb44135dc5..a5cad7bbc76a 100644 --- a/tests/monotouch-test/Security/RecordTest.cs +++ b/tests/monotouch-test/Security/RecordTest.cs @@ -308,115 +308,114 @@ public void AuthenticationType_17579 () #endif public void DeskCase_83099_InmutableDictionary () { + // Use a unique server name per process to avoid cross-process keychain + // conflicts on shared CI agents (the account + server pair is the identity). + var testServer = $"Test1-{Environment.ProcessId}"; var testUsername = "testusername"; - // Clean up any stale keychain entries from previous test runs to avoid - // the keychain returning ItemNotFound on query but DuplicateItem on add. - ForceRemoveUserPassword (testUsername); + // Clean up any stale keychain entries from previous test runs. + var cleanupCode = ForceRemoveKeychainEntry (testServer, testUsername); + TestContext.Out.WriteLine ($"Initial cleanup: {cleanupCode}"); + // Also clean up entries from the old hardcoded "Test1" server name. + ForceRemoveKeychainEntry ("Test1", testUsername); try { //TEST 1: Save a keychain value - var test1 = SaveUserPassword (testUsername, "testValue1", out var queryCode, out var addCode, out var updateCode); + var test1 = SaveKeychainEntry (testServer, testUsername, "testValue1", out var queryCode, out var addCode, out var updateCode); Assert.IsTrue (test1, $"Password could not be saved to keychain. queryCode: {queryCode} addCode: {addCode} updateCode: {updateCode}"); //TEST 2: Get the saved keychain value - var test2 = GetUserPassword (testUsername); + var test2 = GetKeychainEntry (testServer, testUsername); Assert.IsTrue (StringUtil.StringsEqual (test2, "testValue1", false)); //TEST 3: Update the keychain value - var test3 = SaveUserPassword (testUsername, "testValue2", out queryCode, out addCode, out updateCode); + var test3 = SaveKeychainEntry (testServer, testUsername, "testValue2", out queryCode, out addCode, out updateCode); Assert.IsTrue (test3, $"Password could not be saved to keychain. queryCode: {queryCode} addCode: {addCode} updateCode: {updateCode}"); //TEST 4: Get the updated keychain value - var test4 = GetUserPassword (testUsername); + var test4 = GetKeychainEntry (testServer, testUsername); Assert.IsTrue (StringUtil.StringsEqual (test4, "testValue2", false)); //TEST 5: Clear the keychain values - var test5 = ClearUserPassword (testUsername, out queryCode, out var removeCode); + var test5 = ClearKeychainEntry (testServer, testUsername, out queryCode, out var removeCode); Assert.IsTrue (test5, $"Password could not be cleared from keychain. queryCode: {queryCode} removeCode: {removeCode}"); //TEST 6: Verify no keychain value - var test6 = GetUserPassword (testUsername); + var test6 = GetKeychainEntry (testServer, testUsername); Assert.IsNull (test6, "No password should exist here"); } finally { // Always clean up to avoid leaving stale entries for subsequent runs - ForceRemoveUserPassword (testUsername); + ForceRemoveKeychainEntry (testServer, testUsername); } } - public static string GetUserPassword (string username) + // Create a minimal SecRecord for keychain queries and deletes (no LAContext). + // Using LAContext with InteractionNotAllowed on search records can cause + // intermittent InvalidRecord errors on some macOS keychain states. + static SecRecord CreateKeychainSearchRecord (string server, string username) + { + return new SecRecord (SecKind.InternetPassword) { + Server = server, + Account = username.ToLower (), + }; + } + + public static string GetKeychainEntry (string server, string username) { string password = null; - var searchRecord = CreateSecRecord (SecKind.InternetPassword, - server: "Test1", - account: username.ToLower () - ); - SecStatusCode code; - var record = SecKeyChain.QueryAsRecord (searchRecord, out code); + var searchRecord = CreateKeychainSearchRecord (server, username); + var record = SecKeyChain.QueryAsRecord (searchRecord, out var code); if (code == SecStatusCode.Success && record is not null) password = NSString.FromData (record.ValueData, NSStringEncoding.UTF8); return password; } - public static bool SaveUserPassword (string username, string password, out SecStatusCode queryCode, out SecStatusCode addCode, out SecStatusCode updateCode) + public static bool SaveKeychainEntry (string server, string username, string password, out SecStatusCode queryCode, out SecStatusCode addCode, out SecStatusCode updateCode) { - addCode = (SecStatusCode) (-1); // pick a value that doesn't already exist in SecStatusCode - updateCode = (SecStatusCode) (-1); // pick a value that doesn't already exist in SecStatusCode - var success = false; - var searchRecord = CreateSecRecord (SecKind.InternetPassword, - server: "Test1", - account: username.ToLower () - ); + addCode = (SecStatusCode) (-1); + updateCode = (SecStatusCode) (-1); + var searchRecord = CreateKeychainSearchRecord (server, username); var record = SecKeyChain.QueryAsRecord (searchRecord, out queryCode); - if (queryCode == SecStatusCode.ItemNotFound) { - record = CreateSecRecord (SecKind.InternetPassword, - server: "Test1", - account: username.ToLower (), - valueData: NSData.FromString (password) - ); - addCode = SecKeyChain.Add (record); - success = (addCode == SecStatusCode.Success); - // Handle inconsistent keychain state: query returned ItemNotFound - // but add returned DuplicateItem. Force-remove and retry. - if (addCode == SecStatusCode.DuplicateItem) { - SecKeyChain.Remove (searchRecord); - addCode = SecKeyChain.Add (record); - success = (addCode == SecStatusCode.Success); - } - } if (queryCode == SecStatusCode.Success && record is not null) { + // Record exists, update it. record.ValueData = NSData.FromString (password); updateCode = SecKeyChain.Update (searchRecord, record); - success = (updateCode == SecStatusCode.Success); + return updateCode == SecStatusCode.Success; + } + // Record doesn't exist, or query returned an unexpected error + // (e.g. InvalidRecord). Force-remove to handle inconsistent keychain + // state, then add. + SecKeyChain.Remove (searchRecord); + record = new SecRecord (SecKind.InternetPassword) { + Server = server, + Account = username.ToLower (), + ValueData = NSData.FromString (password), + }; + addCode = SecKeyChain.Add (record); + if (addCode == SecStatusCode.DuplicateItem) { + SecKeyChain.Remove (searchRecord); + addCode = SecKeyChain.Add (record); } - return success; + return addCode == SecStatusCode.Success; } - public static void ForceRemoveUserPassword (string username) + public static SecStatusCode ForceRemoveKeychainEntry (string server, string username) { - var searchRecord = CreateSecRecord (SecKind.InternetPassword, - server: "Test1", - account: username.ToLower () - ); - SecKeyChain.Remove (searchRecord); + var searchRecord = CreateKeychainSearchRecord (server, username); + return SecKeyChain.Remove (searchRecord); } - public static bool ClearUserPassword (string username, out SecStatusCode queryCode, out SecStatusCode? removeCode) + public static bool ClearKeychainEntry (string server, string username, out SecStatusCode queryCode, out SecStatusCode? removeCode) { - var success = false; - var searchRecord = CreateSecRecord (SecKind.InternetPassword, - server: "Test1", - account: username.ToLower () - ); + var searchRecord = CreateKeychainSearchRecord (server, username); var record = SecKeyChain.QueryAsRecord (searchRecord, out queryCode); if (queryCode == SecStatusCode.Success && record is not null) { removeCode = SecKeyChain.Remove (searchRecord); - success = (removeCode == SecStatusCode.Success); - } else { - removeCode = null; + return removeCode == SecStatusCode.Success; } - return success; + removeCode = null; + return false; } [Test] From c9d210ba1b73fbb635d05966609243d11259cee3 Mon Sep 17 00:00:00 2001 From: Rolf Bjarne Kvinge Date: Mon, 27 Apr 2026 19:12:19 +0200 Subject: [PATCH 2/4] Add fix-random-ci-test-failure skill for diagnosing flaky tests Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../fix-random-ci-test-failure/SKILL.md | 108 ++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 .github/skills/fix-random-ci-test-failure/SKILL.md diff --git a/.github/skills/fix-random-ci-test-failure/SKILL.md b/.github/skills/fix-random-ci-test-failure/SKILL.md new file mode 100644 index 000000000000..ce55a66650de --- /dev/null +++ b/.github/skills/fix-random-ci-test-failure/SKILL.md @@ -0,0 +1,108 @@ +--- +name: fix-random-ci-test-failure +description: >- + Investigate and fix flaky/random CI test failures in dotnet/macios. Trigger on + GitHub issues describing intermittent test failures, CI postmortem issues, or when + asked to fix a flaky test. Analyzes test code, identifies root causes + (shared state, environment dependencies, race conditions), and applies fixes. +--- + +# Fix Random CI Test Failure + +Investigate and fix flaky CI test failures reported in GitHub issues for dotnet/macios. + +## When to Use + +- User shares a GitHub issue about a flaky/intermittent test failure +- Issue has the `ci-postmortem` label +- User asks to fix a randomly failing test +- Test failures appear across unrelated PRs (indicating environment-dependent flakiness) + +## Workflow + +### 1. Gather Failure Context + +Read the GitHub issue to extract: +- **Test name** and **test suite** (e.g., monotouch-test) +- **Error message** and **assertion failure details** +- **Platform** (iOS, macOS, tvOS, Mac Catalyst) +- **Affected builds** — check if failures span unrelated PRs (confirms flakiness vs. regression) + +If build URLs are provided, inspect logs for additional context (stack traces, timing info). + +### 2. Locate and Analyze the Test + +Find the test source file: +``` +grep -r "TestMethodName" tests/ +``` + +Read the **full test method** and all helper methods it calls. Understand: +- What external resources does it use? (keychain, file system, network, simulators) +- Does it use shared/hardcoded identifiers that could collide across parallel runs? +- Does it properly clean up before and after execution? +- Are there error paths that silently swallow failures? + +### 3. Identify Root Cause Category + +Common flaky test root causes in this repo: + +#### Shared State / Resource Conflicts +- **Symptom**: Test uses hardcoded identifiers (e.g., fixed keychain entries, file paths, port numbers) +- **Fix**: Use process-unique identifiers (PID, GUID, bundle ID + test name) +- **Reference**: `tests/monotouch-test/Security/KeyChainTest.cs` uses per-process unique IDs + +#### Environment-Dependent State +- **Symptom**: Test depends on OS-level state (keychain, permissions, network availability) +- **Fix**: Add robust setup/teardown, handle unexpected initial states, add retry logic for transient errors + +#### Unhandled Error Codes +- **Symptom**: Code only handles expected success/failure codes, silently fails on unexpected ones +- **Fix**: Add fallback handling for unexpected status codes, log diagnostic info + +#### Race Conditions / Timing +- **Symptom**: Test passes locally but fails intermittently in CI +- **Fix**: Add proper synchronization, increase timeouts, avoid timing-dependent assertions + +#### LAContext / Authentication Issues (Security tests) +- **Symptom**: `InvalidRecord` or authentication-related errors on keychain operations +- **Fix**: Only attach `LAContext` where actually needed (not on plain query/delete operations) + +### 4. Apply the Fix + +When fixing: + +1. **Prefer unique identifiers over shared ones.** Use `Environment.ProcessId`, `Guid.NewGuid()`, or `{bundleId}-{testName}-{pid}` patterns for resource identifiers. + +2. **Create minimal query records.** For search/delete operations, don't attach unnecessary attributes (like `LAContext`) that can cause intermittent errors. + +3. **Handle all status codes.** Never silently return `false` for unexpected error codes. Either handle them with a fallback path or fail with a descriptive assertion message. + +4. **Add diagnostic logging.** Use `TestContext.Out.WriteLine` to log operation results that would help diagnose future failures. + +5. **Clean up legacy state.** If renaming identifiers, also clean up entries from old names that may linger on CI agents. + +6. **Always clean up in `finally` blocks.** Ensure test resources are released even on failure. + +### 5. If the Fix is Unclear + +If you cannot determine the root cause with available information: + +1. **Add diagnostic logging** to capture operation results (status codes, error details) in future failures. +2. **Log the initial state** at test start (e.g., does the resource already exist before the test runs?). +3. **Include all relevant status codes** in assertion failure messages. +4. Explain to the user what additional information the logging will provide and what hypotheses it will help test. + +### 6. Create the PR + +- Branch naming: `dev/rolf/fix-{test-name}` or similar +- Commit message should explain all root causes addressed +- Reference the GitHub issue with `Fixes #NNNN` +- Add the `copilot` label to the PR + +## Key Patterns in This Repo + +- Keychain tests in `tests/monotouch-test/Security/` are particularly prone to flakiness due to shared macOS keychain state on CI agents +- `InitSecRecord` in `RecordTest.cs` attaches `LAContext` to all `SecRecord` instances — this is sometimes unnecessary and can cause `InvalidRecord` errors +- `KeyChainTest.cs` demonstrates the recommended pattern: per-process unique identifiers using bundle ID + test type + PID +- Tests run on shared CI agent machines where leftover state from previous runs can cause interference From 705a4c4f6d0a7e014753e97393e5235b2295c59d Mon Sep 17 00:00:00 2001 From: Rolf Bjarne Kvinge Date: Mon, 27 Apr 2026 19:13:21 +0200 Subject: [PATCH 3/4] Revert "Add fix-random-ci-test-failure skill for diagnosing flaky tests" This reverts commit c9d210ba1b73fbb635d05966609243d11259cee3. --- .../fix-random-ci-test-failure/SKILL.md | 108 ------------------ 1 file changed, 108 deletions(-) delete mode 100644 .github/skills/fix-random-ci-test-failure/SKILL.md diff --git a/.github/skills/fix-random-ci-test-failure/SKILL.md b/.github/skills/fix-random-ci-test-failure/SKILL.md deleted file mode 100644 index ce55a66650de..000000000000 --- a/.github/skills/fix-random-ci-test-failure/SKILL.md +++ /dev/null @@ -1,108 +0,0 @@ ---- -name: fix-random-ci-test-failure -description: >- - Investigate and fix flaky/random CI test failures in dotnet/macios. Trigger on - GitHub issues describing intermittent test failures, CI postmortem issues, or when - asked to fix a flaky test. Analyzes test code, identifies root causes - (shared state, environment dependencies, race conditions), and applies fixes. ---- - -# Fix Random CI Test Failure - -Investigate and fix flaky CI test failures reported in GitHub issues for dotnet/macios. - -## When to Use - -- User shares a GitHub issue about a flaky/intermittent test failure -- Issue has the `ci-postmortem` label -- User asks to fix a randomly failing test -- Test failures appear across unrelated PRs (indicating environment-dependent flakiness) - -## Workflow - -### 1. Gather Failure Context - -Read the GitHub issue to extract: -- **Test name** and **test suite** (e.g., monotouch-test) -- **Error message** and **assertion failure details** -- **Platform** (iOS, macOS, tvOS, Mac Catalyst) -- **Affected builds** — check if failures span unrelated PRs (confirms flakiness vs. regression) - -If build URLs are provided, inspect logs for additional context (stack traces, timing info). - -### 2. Locate and Analyze the Test - -Find the test source file: -``` -grep -r "TestMethodName" tests/ -``` - -Read the **full test method** and all helper methods it calls. Understand: -- What external resources does it use? (keychain, file system, network, simulators) -- Does it use shared/hardcoded identifiers that could collide across parallel runs? -- Does it properly clean up before and after execution? -- Are there error paths that silently swallow failures? - -### 3. Identify Root Cause Category - -Common flaky test root causes in this repo: - -#### Shared State / Resource Conflicts -- **Symptom**: Test uses hardcoded identifiers (e.g., fixed keychain entries, file paths, port numbers) -- **Fix**: Use process-unique identifiers (PID, GUID, bundle ID + test name) -- **Reference**: `tests/monotouch-test/Security/KeyChainTest.cs` uses per-process unique IDs - -#### Environment-Dependent State -- **Symptom**: Test depends on OS-level state (keychain, permissions, network availability) -- **Fix**: Add robust setup/teardown, handle unexpected initial states, add retry logic for transient errors - -#### Unhandled Error Codes -- **Symptom**: Code only handles expected success/failure codes, silently fails on unexpected ones -- **Fix**: Add fallback handling for unexpected status codes, log diagnostic info - -#### Race Conditions / Timing -- **Symptom**: Test passes locally but fails intermittently in CI -- **Fix**: Add proper synchronization, increase timeouts, avoid timing-dependent assertions - -#### LAContext / Authentication Issues (Security tests) -- **Symptom**: `InvalidRecord` or authentication-related errors on keychain operations -- **Fix**: Only attach `LAContext` where actually needed (not on plain query/delete operations) - -### 4. Apply the Fix - -When fixing: - -1. **Prefer unique identifiers over shared ones.** Use `Environment.ProcessId`, `Guid.NewGuid()`, or `{bundleId}-{testName}-{pid}` patterns for resource identifiers. - -2. **Create minimal query records.** For search/delete operations, don't attach unnecessary attributes (like `LAContext`) that can cause intermittent errors. - -3. **Handle all status codes.** Never silently return `false` for unexpected error codes. Either handle them with a fallback path or fail with a descriptive assertion message. - -4. **Add diagnostic logging.** Use `TestContext.Out.WriteLine` to log operation results that would help diagnose future failures. - -5. **Clean up legacy state.** If renaming identifiers, also clean up entries from old names that may linger on CI agents. - -6. **Always clean up in `finally` blocks.** Ensure test resources are released even on failure. - -### 5. If the Fix is Unclear - -If you cannot determine the root cause with available information: - -1. **Add diagnostic logging** to capture operation results (status codes, error details) in future failures. -2. **Log the initial state** at test start (e.g., does the resource already exist before the test runs?). -3. **Include all relevant status codes** in assertion failure messages. -4. Explain to the user what additional information the logging will provide and what hypotheses it will help test. - -### 6. Create the PR - -- Branch naming: `dev/rolf/fix-{test-name}` or similar -- Commit message should explain all root causes addressed -- Reference the GitHub issue with `Fixes #NNNN` -- Add the `copilot` label to the PR - -## Key Patterns in This Repo - -- Keychain tests in `tests/monotouch-test/Security/` are particularly prone to flakiness due to shared macOS keychain state on CI agents -- `InitSecRecord` in `RecordTest.cs` attaches `LAContext` to all `SecRecord` instances — this is sometimes unnecessary and can cause `InvalidRecord` errors -- `KeyChainTest.cs` demonstrates the recommended pattern: per-process unique identifiers using bundle ID + test type + PID -- Tests run on shared CI agent machines where leftover state from previous runs can cause interference From e77faa873a230ed37ef1df49a61a62574876c1e3 Mon Sep 17 00:00:00 2001 From: Rolf Bjarne Kvinge Date: Wed, 29 Apr 2026 10:42:25 +0200 Subject: [PATCH 4/4] Add macOS 11 hang guard to DeskCase_83099_InmutableDictionary The new helper methods bypass InitSecRecord (intentionally, to avoid LAContext), but that also skipped the macOS 11.* CI hang guard. Add the guard directly to the test method. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/monotouch-test/Security/RecordTest.cs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/monotouch-test/Security/RecordTest.cs b/tests/monotouch-test/Security/RecordTest.cs index a5cad7bbc76a..314cb4fe31fb 100644 --- a/tests/monotouch-test/Security/RecordTest.cs +++ b/tests/monotouch-test/Security/RecordTest.cs @@ -308,6 +308,11 @@ public void AuthenticationType_17579 () #endif public void DeskCase_83099_InmutableDictionary () { +#if __MACOS__ + // macOS 11.* hangs on keychain operations in CI + if (TestRuntime.CheckXcodeVersion (12, 2) && !TestRuntime.CheckXcodeVersion (13, 0)) + TestRuntime.IgnoreInCI ("Skip on macOS 11.* because it hangs"); +#endif // Use a unique server name per process to avoid cross-process keychain // conflicts on shared CI agents (the account + server pair is the identity). var testServer = $"Test1-{Environment.ProcessId}";