From 3ca5cd04f2787e1f41145d7b642d5526c9a17f84 Mon Sep 17 00:00:00 2001
From: Daniel Edrisian <dedrisian@openai.com>
Date: Sun, 14 Sep 2025 14:50:48 -0700
Subject: [PATCH 1/4] Fix flaky compact and review tests on Windows CI

---
 .github/workflows/rust-ci.yml        | 74 ++++++++++++++--------------
 codex-rs/core/tests/suite/compact.rs | 57 +++++++++++++++++++--
 codex-rs/core/tests/suite/review.rs  | 37 +++++++++++---
 3 files changed, 121 insertions(+), 47 deletions(-)

diff --git a/.github/workflows/rust-ci.yml b/.github/workflows/rust-ci.yml
index 280939c611d..18ffe887999 100644
--- a/.github/workflows/rust-ci.yml
+++ b/.github/workflows/rust-ci.yml
@@ -85,7 +85,7 @@ jobs:
 
   # --- CI to validate on different os/targets --------------------------------
   lint_build_test:
-    name: ${{ matrix.runner }} - ${{ matrix.target }}${{ matrix.profile == 'release' && ' (release)' || '' }}
+    name: ${{ matrix.runner }} - ${{ matrix.target }} [run ${{ matrix.repeat }}]${{ matrix.profile == 'release' && ' (release)' || '' }}
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 30
     needs: changed
@@ -99,47 +99,49 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - runner: macos-14
-            target: aarch64-apple-darwin
-            profile: dev
-          - runner: macos-14
-            target: x86_64-apple-darwin
-            profile: dev
-          - runner: ubuntu-24.04
-            target: x86_64-unknown-linux-musl
-            profile: dev
-          - runner: ubuntu-24.04
-            target: x86_64-unknown-linux-gnu
-            profile: dev
-          - runner: ubuntu-24.04-arm
-            target: aarch64-unknown-linux-musl
-            profile: dev
-          - runner: ubuntu-24.04-arm
-            target: aarch64-unknown-linux-gnu
-            profile: dev
+          # - runner: macos-14
+          #   target: aarch64-apple-darwin
+          #   profile: dev
+          # - runner: macos-14
+          #   target: x86_64-apple-darwin
+          #   profile: dev
+          # - runner: ubuntu-24.04
+          #   target: x86_64-unknown-linux-musl
+          #   profile: dev
+          # - runner: ubuntu-24.04
+          #   target: x86_64-unknown-linux-gnu
+          #   profile: dev
+          # - runner: ubuntu-24.04-arm
+          #   target: aarch64-unknown-linux-musl
+          #   profile: dev
+          # - runner: ubuntu-24.04-arm
+          #   target: aarch64-unknown-linux-gnu
+          #   profile: dev
           - runner: windows-latest
             target: x86_64-pc-windows-msvc
             profile: dev
-          - runner: windows-11-arm
-            target: aarch64-pc-windows-msvc
-            profile: dev
+          # - runner: windows-11-arm
+          #   target: aarch64-pc-windows-msvc
+          #   profile: dev
 
           # Also run representative release builds on Mac and Linux because
           # there could be release-only build errors we want to catch.
           # Hopefully this also pre-populates the build cache to speed up
           # releases.
-          - runner: macos-14
-            target: aarch64-apple-darwin
-            profile: release
-          - runner: ubuntu-24.04
-            target: x86_64-unknown-linux-musl
-            profile: release
-          - runner: windows-latest
-            target: x86_64-pc-windows-msvc
-            profile: release
-          - runner: windows-11-arm
-            target: aarch64-pc-windows-msvc
-            profile: release
+          # - runner: macos-14
+          #   target: aarch64-apple-darwin
+          #   profile: release
+          # - runner: ubuntu-24.04
+          #   target: x86_64-unknown-linux-musl
+          #   profile: release
+          # - runner: windows-latest
+          #   target: x86_64-pc-windows-msvc
+          #   profile: release
+          # - runner: windows-11-arm
+          #   target: aarch64-pc-windows-msvc
+          #   profile: release
+
+        repeat: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
 
     steps:
       - uses: actions/checkout@v5
@@ -165,7 +167,7 @@ jobs:
 
       - name: cargo clippy
         id: clippy
-        run: cargo clippy --target ${{ matrix.target }} --all-features --tests --profile ${{ matrix.profile }} -- -D warnings
+        run: cargo clippy --target ${{ matrix.target }} -p codex-core --tests --profile ${{ matrix.profile }}
 
       # Running `cargo build` from the workspace root builds the workspace using
       # the union of all features from third-party crates. This can mask errors
@@ -190,7 +192,7 @@ jobs:
         # Tests take too long for release builds to run them on every PR.
         if: ${{ matrix.profile != 'release' }}
         continue-on-error: true
-        run: cargo nextest run --all-features --no-fail-fast --target ${{ matrix.target }}
+        run: cargo nextest run -p codex-core --no-fail-fast --target ${{ matrix.target }}
         env:
           RUST_BACKTRACE: 1
 
diff --git a/codex-rs/core/tests/suite/compact.rs b/codex-rs/core/tests/suite/compact.rs
index 72f4021e83c..a8bd9bfbd47 100644
--- a/codex-rs/core/tests/suite/compact.rs
+++ b/codex-rs/core/tests/suite/compact.rs
@@ -1,6 +1,7 @@
 #![expect(clippy::unwrap_used)]
 
 use codex_core::CodexAuth;
+use codex_core::CodexConversation;
 use codex_core::ConversationManager;
 use codex_core::ModelProviderInfo;
 use codex_core::NewConversation;
@@ -16,6 +17,8 @@ use core_test_support::load_default_config_for_test;
 use core_test_support::wait_for_event;
 use serde_json::Value;
 use tempfile::TempDir;
+use tokio::time::Duration;
+use tokio::time::timeout;
 use wiremock::BodyPrintLimit;
 use wiremock::Mock;
 use wiremock::MockServer;
@@ -126,6 +129,22 @@ async fn start_mock_server() -> MockServer {
         .await
 }
 
+async fn wait_for_non_auto_task_complete(codex: &CodexConversation, wait_time: Duration) {
+    loop {
+        let wait_budget = wait_time.max(Duration::from_secs(5));
+        let event = match timeout(wait_budget, codex.next_event()).await {
+            Ok(Ok(ev)) => ev,
+            Ok(Err(_)) => panic!("stream ended unexpectedly"),
+            Err(_) => panic!("timeout waiting for event"),
+        };
+        if let EventMsg::TaskComplete(_) = &event.msg
+            && !event.id.starts_with("auto-compact-")
+        {
+            break;
+        }
+    }
+}
+
 pub(super) const FIRST_REPLY: &str = "FIRST_REPLY";
 pub(super) const SUMMARY_TEXT: &str = "SUMMARY_ONLY_CONTEXT";
 pub(super) const SUMMARIZE_TRIGGER: &str = "Start Summarization";
@@ -366,7 +385,8 @@ async fn summarize_context_three_requests_and_instructions() {
     );
 }
 
-#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+#[cfg_attr(windows, tokio::test(flavor = "multi_thread", worker_threads = 4))]
+#[cfg_attr(not(windows), tokio::test(flavor = "multi_thread", worker_threads = 2))]
 async fn auto_compact_runs_after_token_limit_hit() {
     if std::env::var(CODEX_SANDBOX_NETWORK_DISABLED_ENV_VAR).is_ok() {
         println!(
@@ -453,7 +473,8 @@ async fn auto_compact_runs_after_token_limit_hit() {
         })
         .await
         .unwrap();
-    wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
+
+    wait_for_non_auto_task_complete(&codex, Duration::from_secs(20)).await;
 
     codex
         .submit(Op::UserInput {
@@ -463,13 +484,39 @@ async fn auto_compact_runs_after_token_limit_hit() {
         })
         .await
         .unwrap();
-    wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
+
+    wait_for_non_auto_task_complete(&codex, Duration::from_secs(20)).await;
     // wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
 
     let requests = server.received_requests().await.unwrap();
-    assert_eq!(requests.len(), 3, "auto compact should add a third request");
+    assert!(
+        requests.len() >= 3,
+        "auto compact should add at least a third request, got {}",
+        requests.len()
+    );
+    let is_auto_compact = |req: &wiremock::Request| {
+        std::str::from_utf8(&req.body)
+            .unwrap_or("")
+            .contains("You have exceeded the maximum number of tokens")
+    };
+    let auto_compact_count = requests.iter().filter(|req| is_auto_compact(req)).count();
+    assert_eq!(
+        auto_compact_count, 1,
+        "expected exactly one auto compact request"
+    );
+    let auto_compact_index = requests
+        .iter()
+        .enumerate()
+        .find_map(|(idx, req)| is_auto_compact(req).then_some(idx))
+        .expect("auto compact request missing");
+    assert_eq!(
+        auto_compact_index, 2,
+        "auto compact should add a third request"
+    );
 
-    let body3 = requests[2].body_json::<serde_json::Value>().unwrap();
+    let body3 = requests[auto_compact_index]
+        .body_json::<serde_json::Value>()
+        .unwrap();
     let instructions = body3
         .get("instructions")
         .and_then(|v| v.as_str())
diff --git a/codex-rs/core/tests/suite/review.rs b/codex-rs/core/tests/suite/review.rs
index 21d447e25d2..cf5d650ee86 100644
--- a/codex-rs/core/tests/suite/review.rs
+++ b/codex-rs/core/tests/suite/review.rs
@@ -16,6 +16,7 @@ use codex_core::spawn::CODEX_SANDBOX_NETWORK_DISABLED_ENV_VAR;
 use core_test_support::load_default_config_for_test;
 use core_test_support::load_sse_fixture_with_id_from_str;
 use core_test_support::wait_for_event;
+use core_test_support::wait_for_event_with_timeout;
 use pretty_assertions::assert_eq;
 use std::path::PathBuf;
 use std::sync::Arc;
@@ -118,7 +119,8 @@ async fn review_op_emits_lifecycle_and_review_output() {
 /// When the model returns plain text that is not JSON, ensure the child
 /// lifecycle still occurs and the plain text is surfaced via
 /// ExitedReviewMode(Some(..)) as the overall_explanation.
-#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+#[cfg_attr(windows, tokio::test(flavor = "multi_thread", worker_threads = 4))]
+#[cfg_attr(not(windows), tokio::test(flavor = "multi_thread", worker_threads = 2))]
 async fn review_op_with_plain_text_emits_review_fallback() {
     if std::env::var(CODEX_SANDBOX_NETWORK_DISABLED_ENV_VAR).is_ok() {
         println!(
@@ -168,7 +170,8 @@ async fn review_op_with_plain_text_emits_review_fallback() {
 
 /// When the model returns structured JSON in a review, ensure no AgentMessage
 /// is emitted; the UI consumes the structured result via ExitedReviewMode.
-#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+#[cfg_attr(windows, tokio::test(flavor = "multi_thread", worker_threads = 4))]
+#[cfg_attr(not(windows), tokio::test(flavor = "multi_thread", worker_threads = 2))]
 async fn review_does_not_emit_agent_message_on_structured_output() {
     if std::env::var(CODEX_SANDBOX_NETWORK_DISABLED_ENV_VAR).is_ok() {
         println!(
@@ -293,7 +296,8 @@ async fn review_uses_custom_review_model_from_config() {
 /// When a review session begins, it must not prepend prior chat history from
 /// the parent session. The request `input` should contain only the review
 /// prompt from the user.
-#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+#[cfg_attr(windows, tokio::test(flavor = "multi_thread", worker_threads = 4))]
+#[cfg_attr(not(windows), tokio::test(flavor = "multi_thread", worker_threads = 2))]
 async fn review_input_isolated_from_parent_history() {
     if std::env::var(CODEX_SANDBOX_NETWORK_DISABLED_ENV_VAR).is_ok() {
         println!(
@@ -393,9 +397,26 @@ async fn review_input_isolated_from_parent_history() {
         .await
         .unwrap();
 
-    let _entered = wait_for_event(&codex, |ev| matches!(ev, EventMsg::EnteredReviewMode(_))).await;
-    let _closed = wait_for_event(&codex, |ev| matches!(ev, EventMsg::ExitedReviewMode(None))).await;
-    let _complete = wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
+    let _entered = wait_for_event_with_timeout(
+        &codex,
+        |ev| matches!(ev, EventMsg::EnteredReviewMode(_)),
+        tokio::time::Duration::from_secs(20),
+    )
+    .await;
+
+    let _closed = wait_for_event_with_timeout(
+        &codex,
+        |ev| matches!(ev, EventMsg::ExitedReviewMode(_)), // or ExitedReviewMode(None) as appropriate
+        tokio::time::Duration::from_secs(20),
+    )
+    .await;
+
+    let _complete = wait_for_event_with_timeout(
+        &codex,
+        |ev| matches!(ev, EventMsg::TaskComplete(_)),
+        tokio::time::Duration::from_secs(20),
+    )
+    .await;
 
     // Assert the request `input` contains only the single review user message.
     let request = &server.received_requests().await.unwrap()[0];
@@ -507,6 +528,8 @@ async fn start_responses_server_with_sse(sse_raw: &str, expected_requests: usize
         .respond_with(
             ResponseTemplate::new(200)
                 .insert_header("content-type", "text/event-stream")
+                .insert_header("connection", "close")
+                .insert_header("content-length", sse.len().to_string())
                 .set_body_raw(sse.clone(), "text/event-stream"),
         )
         .expect(expected_requests as u64)
@@ -527,6 +550,8 @@ where
 {
     let model_provider = ModelProviderInfo {
         base_url: Some(format!("{}/v1", server.uri())),
+        // Give Windows runners breathing room.
+        stream_idle_timeout_ms: Some(if cfg!(windows) { 10_000 } else { 1_000 }),
         ..built_in_model_providers()["openai"].clone()
     };
     let mut config = load_default_config_for_test(codex_home);

From c5ad3cb029dfe5296a2a86901413752263f19b73 Mon Sep 17 00:00:00 2001
From: Daniel Edrisian <dedrisian@openai.com>
Date: Sun, 14 Sep 2025 14:57:28 -0700
Subject: [PATCH 2/4] Revert review.rs event timeouts, response headers, and idle-timeout override

---
 codex-rs/core/tests/suite/review.rs | 28 ++++------------------------
 1 file changed, 4 insertions(+), 24 deletions(-)

diff --git a/codex-rs/core/tests/suite/review.rs b/codex-rs/core/tests/suite/review.rs
index cf5d650ee86..b79c59ba5f1 100644
--- a/codex-rs/core/tests/suite/review.rs
+++ b/codex-rs/core/tests/suite/review.rs
@@ -22,6 +22,7 @@ use std::path::PathBuf;
 use std::sync::Arc;
 use tempfile::TempDir;
 use tokio::io::AsyncWriteExt as _;
+use tokio::time::Duration;
 use uuid::Uuid;
 use wiremock::Mock;
 use wiremock::MockServer;
@@ -397,26 +398,9 @@ async fn review_input_isolated_from_parent_history() {
         .await
         .unwrap();
 
-    let _entered = wait_for_event_with_timeout(
-        &codex,
-        |ev| matches!(ev, EventMsg::EnteredReviewMode(_)),
-        tokio::time::Duration::from_secs(20),
-    )
-    .await;
-
-    let _closed = wait_for_event_with_timeout(
-        &codex,
-        |ev| matches!(ev, EventMsg::ExitedReviewMode(_)), // or ExitedReviewMode(None) as appropriate
-        tokio::time::Duration::from_secs(20),
-    )
-    .await;
-
-    let _complete = wait_for_event_with_timeout(
-        &codex,
-        |ev| matches!(ev, EventMsg::TaskComplete(_)),
-        tokio::time::Duration::from_secs(20),
-    )
-    .await;
+    let _entered = wait_for_event(&codex, |ev| matches!(ev, EventMsg::EnteredReviewMode(_))).await;
+    let _closed = wait_for_event(&codex, |ev| matches!(ev, EventMsg::ExitedReviewMode(None))).await;
+    let _complete = wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
 
     // Assert the request `input` contains only the single review user message.
     let request = &server.received_requests().await.unwrap()[0];
@@ -528,8 +512,6 @@ async fn start_responses_server_with_sse(sse_raw: &str, expected_requests: usize
         .respond_with(
             ResponseTemplate::new(200)
                 .insert_header("content-type", "text/event-stream")
-                .insert_header("connection", "close")
-                .insert_header("content-length", sse.len().to_string())
                 .set_body_raw(sse.clone(), "text/event-stream"),
         )
         .expect(expected_requests as u64)
@@ -550,8 +532,6 @@ where
 {
     let model_provider = ModelProviderInfo {
         base_url: Some(format!("{}/v1", server.uri())),
-        // Give Windows runners breathing room.
-        stream_idle_timeout_ms: Some(if cfg!(windows) { 10_000 } else { 1_000 }),
         ..built_in_model_providers()["openai"].clone()
     };
     let mut config = load_default_config_for_test(codex_home);

From d6d52b5b873d400dd7d6a5524c125e2ab377982b Mon Sep 17 00:00:00 2001
From: Daniel Edrisian <dedrisian@openai.com>
Date: Sun, 14 Sep 2025 15:07:37 -0700
Subject: [PATCH 3/4] Clean up: remove unused wait helper and imports from compact/review tests

---
 codex-rs/core/tests/suite/compact.rs | 23 ++---------------------
 codex-rs/core/tests/suite/review.rs  |  2 --
 2 files changed, 2 insertions(+), 23 deletions(-)

diff --git a/codex-rs/core/tests/suite/compact.rs b/codex-rs/core/tests/suite/compact.rs
index a8bd9bfbd47..fdf1a4e48bd 100644
--- a/codex-rs/core/tests/suite/compact.rs
+++ b/codex-rs/core/tests/suite/compact.rs
@@ -1,7 +1,6 @@
 #![expect(clippy::unwrap_used)]
 
 use codex_core::CodexAuth;
-use codex_core::CodexConversation;
 use codex_core::ConversationManager;
 use codex_core::ModelProviderInfo;
 use codex_core::NewConversation;
@@ -17,8 +16,6 @@ use core_test_support::load_default_config_for_test;
 use core_test_support::wait_for_event;
 use serde_json::Value;
 use tempfile::TempDir;
-use tokio::time::Duration;
-use tokio::time::timeout;
 use wiremock::BodyPrintLimit;
 use wiremock::Mock;
 use wiremock::MockServer;
@@ -129,22 +126,6 @@ async fn start_mock_server() -> MockServer {
         .await
 }
 
-async fn wait_for_non_auto_task_complete(codex: &CodexConversation, wait_time: Duration) {
-    loop {
-        let wait_budget = wait_time.max(Duration::from_secs(5));
-        let event = match timeout(wait_budget, codex.next_event()).await {
-            Ok(Ok(ev)) => ev,
-            Ok(Err(_)) => panic!("stream ended unexpectedly"),
-            Err(_) => panic!("timeout waiting for event"),
-        };
-        if let EventMsg::TaskComplete(_) = &event.msg
-            && !event.id.starts_with("auto-compact-")
-        {
-            break;
-        }
-    }
-}
-
 pub(super) const FIRST_REPLY: &str = "FIRST_REPLY";
 pub(super) const SUMMARY_TEXT: &str = "SUMMARY_ONLY_CONTEXT";
 pub(super) const SUMMARIZE_TRIGGER: &str = "Start Summarization";
@@ -474,7 +455,7 @@ async fn auto_compact_runs_after_token_limit_hit() {
         .await
         .unwrap();
 
-    wait_for_non_auto_task_complete(&codex, Duration::from_secs(20)).await;
+    wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
 
     codex
         .submit(Op::UserInput {
@@ -485,7 +466,7 @@ async fn auto_compact_runs_after_token_limit_hit() {
         .await
         .unwrap();
 
-    wait_for_non_auto_task_complete(&codex, Duration::from_secs(20)).await;
+    wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
     // wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
 
     let requests = server.received_requests().await.unwrap();
diff --git a/codex-rs/core/tests/suite/review.rs b/codex-rs/core/tests/suite/review.rs
index b79c59ba5f1..9891c536e70 100644
--- a/codex-rs/core/tests/suite/review.rs
+++ b/codex-rs/core/tests/suite/review.rs
@@ -16,13 +16,11 @@ use codex_core::spawn::CODEX_SANDBOX_NETWORK_DISABLED_ENV_VAR;
 use core_test_support::load_default_config_for_test;
 use core_test_support::load_sse_fixture_with_id_from_str;
 use core_test_support::wait_for_event;
-use core_test_support::wait_for_event_with_timeout;
 use pretty_assertions::assert_eq;
 use std::path::PathBuf;
 use std::sync::Arc;
 use tempfile::TempDir;
 use tokio::io::AsyncWriteExt as _;
-use tokio::time::Duration;
 use uuid::Uuid;
 use wiremock::Mock;
 use wiremock::MockServer;

From a898da72399e55d2f974d4a597c0e22a3999c4a9 Mon Sep 17 00:00:00 2001
From: Daniel Edrisian <dedrisian@openai.com>
Date: Sun, 14 Sep 2025 16:11:47 -0700
Subject: [PATCH 4/4] Add comments, and revert rust-ci changes.

---
 .github/workflows/rust-ci.yml        | 74 ++++++++++++++--------------
 codex-rs/core/tests/suite/compact.rs |  1 +
 codex-rs/core/tests/suite/review.rs  |  3 ++
 3 files changed, 40 insertions(+), 38 deletions(-)

diff --git a/.github/workflows/rust-ci.yml b/.github/workflows/rust-ci.yml
index 18ffe887999..280939c611d 100644
--- a/.github/workflows/rust-ci.yml
+++ b/.github/workflows/rust-ci.yml
@@ -85,7 +85,7 @@ jobs:
 
   # --- CI to validate on different os/targets --------------------------------
   lint_build_test:
-    name: ${{ matrix.runner }} - ${{ matrix.target }} [run ${{ matrix.repeat }}]${{ matrix.profile == 'release' && ' (release)' || '' }}
+    name: ${{ matrix.runner }} - ${{ matrix.target }}${{ matrix.profile == 'release' && ' (release)' || '' }}
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 30
     needs: changed
@@ -99,49 +99,47 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          # - runner: macos-14
-          #   target: aarch64-apple-darwin
-          #   profile: dev
-          # - runner: macos-14
-          #   target: x86_64-apple-darwin
-          #   profile: dev
-          # - runner: ubuntu-24.04
-          #   target: x86_64-unknown-linux-musl
-          #   profile: dev
-          # - runner: ubuntu-24.04
-          #   target: x86_64-unknown-linux-gnu
-          #   profile: dev
-          # - runner: ubuntu-24.04-arm
-          #   target: aarch64-unknown-linux-musl
-          #   profile: dev
-          # - runner: ubuntu-24.04-arm
-          #   target: aarch64-unknown-linux-gnu
-          #   profile: dev
+          - runner: macos-14
+            target: aarch64-apple-darwin
+            profile: dev
+          - runner: macos-14
+            target: x86_64-apple-darwin
+            profile: dev
+          - runner: ubuntu-24.04
+            target: x86_64-unknown-linux-musl
+            profile: dev
+          - runner: ubuntu-24.04
+            target: x86_64-unknown-linux-gnu
+            profile: dev
+          - runner: ubuntu-24.04-arm
+            target: aarch64-unknown-linux-musl
+            profile: dev
+          - runner: ubuntu-24.04-arm
+            target: aarch64-unknown-linux-gnu
+            profile: dev
           - runner: windows-latest
             target: x86_64-pc-windows-msvc
             profile: dev
-          # - runner: windows-11-arm
-          #   target: aarch64-pc-windows-msvc
-          #   profile: dev
+          - runner: windows-11-arm
+            target: aarch64-pc-windows-msvc
+            profile: dev
 
           # Also run representative release builds on Mac and Linux because
           # there could be release-only build errors we want to catch.
           # Hopefully this also pre-populates the build cache to speed up
           # releases.
-          # - runner: macos-14
-          #   target: aarch64-apple-darwin
-          #   profile: release
-          # - runner: ubuntu-24.04
-          #   target: x86_64-unknown-linux-musl
-          #   profile: release
-          # - runner: windows-latest
-          #   target: x86_64-pc-windows-msvc
-          #   profile: release
-          # - runner: windows-11-arm
-          #   target: aarch64-pc-windows-msvc
-          #   profile: release
-
-        repeat: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
+          - runner: macos-14
+            target: aarch64-apple-darwin
+            profile: release
+          - runner: ubuntu-24.04
+            target: x86_64-unknown-linux-musl
+            profile: release
+          - runner: windows-latest
+            target: x86_64-pc-windows-msvc
+            profile: release
+          - runner: windows-11-arm
+            target: aarch64-pc-windows-msvc
+            profile: release
 
     steps:
       - uses: actions/checkout@v5
@@ -167,7 +165,7 @@ jobs:
 
       - name: cargo clippy
         id: clippy
-        run: cargo clippy --target ${{ matrix.target }} -p codex-core --tests --profile ${{ matrix.profile }}
+        run: cargo clippy --target ${{ matrix.target }} --all-features --tests --profile ${{ matrix.profile }} -- -D warnings
 
       # Running `cargo build` from the workspace root builds the workspace using
       # the union of all features from third-party crates. This can mask errors
@@ -192,7 +190,7 @@ jobs:
         # Tests take too long for release builds to run them on every PR.
         if: ${{ matrix.profile != 'release' }}
         continue-on-error: true
-        run: cargo nextest run -p codex-core --no-fail-fast --target ${{ matrix.target }}
+        run: cargo nextest run --all-features --no-fail-fast --target ${{ matrix.target }}
         env:
           RUST_BACKTRACE: 1
 
diff --git a/codex-rs/core/tests/suite/compact.rs b/codex-rs/core/tests/suite/compact.rs
index fdf1a4e48bd..361315f7243 100644
--- a/codex-rs/core/tests/suite/compact.rs
+++ b/codex-rs/core/tests/suite/compact.rs
@@ -366,6 +366,7 @@ async fn summarize_context_three_requests_and_instructions() {
     );
 }
 
+// Windows CI only: bump to 4 workers to prevent SSE/event starvation and test timeouts.
 #[cfg_attr(windows, tokio::test(flavor = "multi_thread", worker_threads = 4))]
 #[cfg_attr(not(windows), tokio::test(flavor = "multi_thread", worker_threads = 2))]
 async fn auto_compact_runs_after_token_limit_hit() {
diff --git a/codex-rs/core/tests/suite/review.rs b/codex-rs/core/tests/suite/review.rs
index 9891c536e70..26e0f1107a5 100644
--- a/codex-rs/core/tests/suite/review.rs
+++ b/codex-rs/core/tests/suite/review.rs
@@ -118,6 +118,7 @@ async fn review_op_emits_lifecycle_and_review_output() {
 /// When the model returns plain text that is not JSON, ensure the child
 /// lifecycle still occurs and the plain text is surfaced via
 /// ExitedReviewMode(Some(..)) as the overall_explanation.
+// Windows CI only: bump to 4 workers to prevent SSE/event starvation and test timeouts.
 #[cfg_attr(windows, tokio::test(flavor = "multi_thread", worker_threads = 4))]
 #[cfg_attr(not(windows), tokio::test(flavor = "multi_thread", worker_threads = 2))]
 async fn review_op_with_plain_text_emits_review_fallback() {
@@ -169,6 +170,7 @@ async fn review_op_with_plain_text_emits_review_fallback() {
 
 /// When the model returns structured JSON in a review, ensure no AgentMessage
 /// is emitted; the UI consumes the structured result via ExitedReviewMode.
+// Windows CI only: bump to 4 workers to prevent SSE/event starvation and test timeouts.
 #[cfg_attr(windows, tokio::test(flavor = "multi_thread", worker_threads = 4))]
 #[cfg_attr(not(windows), tokio::test(flavor = "multi_thread", worker_threads = 2))]
 async fn review_does_not_emit_agent_message_on_structured_output() {
@@ -295,6 +297,7 @@ async fn review_uses_custom_review_model_from_config() {
 /// When a review session begins, it must not prepend prior chat history from
 /// the parent session. The request `input` should contain only the review
 /// prompt from the user.
+// Windows CI only: bump to 4 workers to prevent SSE/event starvation and test timeouts.
 #[cfg_attr(windows, tokio::test(flavor = "multi_thread", worker_threads = 4))]
 #[cfg_attr(not(windows), tokio::test(flavor = "multi_thread", worker_threads = 2))]
 async fn review_input_isolated_from_parent_history() {