Make output assertions more explicit (#4784)

Match using precise regexes.
2025-10-05 16:01:38 -07:00
parent 77a8b7fdeb
commit b2d81a7cac
6 changed files with 95 additions and 81 deletions
--- a/codex-rs/Cargo.lock
+++ b/codex-rs/Cargo.lock
@@ -1575,6 +1575,7 @@ dependencies = [
 "anyhow",
 "assert_cmd",
 "codex-core",
 "regex-lite",
 "serde_json",
 "tempfile",
 "tokio",
--- a/codex-rs/core/tests/common/Cargo.toml
+++ b/codex-rs/core/tests/common/Cargo.toml
@@ -10,6 +10,7 @@ path = "lib.rs"
 anyhow = { workspace = true }
 assert_cmd = { workspace = true }
 codex-core = { workspace = true }
 regex-lite = { workspace = true }
 serde_json = { workspace = true }
 tempfile = { workspace = true }
 tokio = { workspace = true, features = ["time"] }
--- a/codex-rs/core/tests/common/lib.rs
+++ b/codex-rs/core/tests/common/lib.rs
@@ -6,6 +6,7 @@ use codex_core::CodexConversation;
 use codex_core::config::Config;
 use codex_core::config::ConfigOverrides;
 use codex_core::config::ConfigToml;
 use regex_lite::Regex;
 #[cfg(target_os = "linux")]
 use assert_cmd::cargo::cargo_bin;
@@ -14,6 +15,16 @@ pub mod responses;
 pub mod test_codex;
 pub mod test_codex_exec;
 #[track_caller]
 pub fn assert_regex_match<'s>(pattern: &str, actual: &'s str) -> regex_lite::Captures<'s> {
    let regex = Regex::new(pattern).unwrap_or_else(|err| {
        panic!("failed to compile regex {pattern:?}: {err}");
    });
    regex
        .captures(actual)
        .unwrap_or_else(|| panic!("regex {pattern:?} did not match {actual:?}"))
 }
 /// Returns a default `Config` whose on-disk state is confined to the provided
 /// temporary directory. Using a per-test directory keeps tests hermetic and
 /// avoids clobbering a developer’s real `~/.codex`.
--- a/codex-rs/core/tests/suite/shell_serialization.rs
+++ b/codex-rs/core/tests/suite/shell_serialization.rs
@@ -8,6 +8,7 @@ use codex_core::protocol::InputItem;
 use codex_core::protocol::Op;
 use codex_core::protocol::SandboxPolicy;
 use codex_protocol::config_types::ReasoningSummary;
 use core_test_support::assert_regex_match;
 use core_test_support::responses::ev_assistant_message;
 use core_test_support::responses::ev_completed;
 use core_test_support::responses::ev_function_call;
@@ -131,10 +132,7 @@ async fn shell_output_stays_json_without_freeform_apply_patch() -> Result<()> {
        .get("output")
        .and_then(Value::as_str)
        .unwrap_or_default();
-    assert!(
+    assert_regex_match(r"(?s)^shell json\n?$", stdout);
        stdout.contains("shell json"),
        "expected stdout to include command output, got {stdout:?}"
    );
    Ok(())
 }
@@ -190,18 +188,12 @@ async fn shell_output_is_structured_with_freeform_apply_patch() -> Result<()> {
        serde_json::from_str::<Value>(output).is_err(),
        "expected structured shell output to be plain text",
    );
-    assert!(
+    let expected_pattern = r"(?s)^Exit code: 0
-        output.starts_with("Exit code: 0\n"),
+Wall time: [0-9]+(?:\.[0-9]+)? seconds
-        "expected exit code prefix, got {output:?}",
+Output:
-    );
+freeform shell
-    assert!(
+?$";
-        output.contains("\nOutput:\n"),
+    assert_regex_match(expected_pattern, output);
        "expected Output section, got {output:?}"
    );
    assert!(
        output.contains("freeform shell"),
        "expected stdout content, got {output:?}"
    );
    Ok(())
 }
@@ -259,18 +251,27 @@ async fn shell_output_reserializes_truncated_content() -> Result<()> {
        serde_json::from_str::<Value>(output).is_err(),
        "expected truncated shell output to be plain text",
    );
-    assert!(
+    let truncated_pattern = r#"(?s)^Exit code: 0
-        output.starts_with("Exit code: 0\n"),
+Wall time: [0-9]+(?:\.[0-9]+)? seconds
-        "expected exit code prefix, got {output:?}",
+Total output lines: 400
-    );
+Output:
-    assert!(
+Total output lines: 400
-        output.lines().any(|line| line == "Total output lines: 400"),
+
-        "expected total output lines marker, got {output:?}",
+1
-    );
+2
-    assert!(
+3
-        output.contains("[... omitted"),
+4
-        "expected truncated marker, got {output:?}",
+5
-    );
+6
 .*\[\.{3} omitted \d+ of 400 lines \.{3}\]
 .*\n396
 397
 398
 399
 400
 $"#;
    assert_regex_match(truncated_pattern, output);
    Ok(())
 }
--- a/codex-rs/core/tests/suite/tool_harness.rs
+++ b/codex-rs/core/tests/suite/tool_harness.rs
@@ -9,6 +9,7 @@ use codex_core::protocol::Op;
 use codex_core::protocol::SandboxPolicy;
 use codex_protocol::config_types::ReasoningSummary;
 use codex_protocol::plan_tool::StepStatus;
 use core_test_support::assert_regex_match;
 use core_test_support::responses;
 use core_test_support::responses::ev_apply_patch_function_call;
 use core_test_support::responses::ev_assistant_message;
@@ -116,10 +117,7 @@ async fn shell_tool_executes_command_and_streams_output() -> anyhow::Result<()>
    let exec_output: Value = serde_json::from_str(output_text)?;
    assert_eq!(exec_output["metadata"]["exit_code"], 0);
    let stdout = exec_output["output"].as_str().expect("stdout field");
-    assert!(
+    assert_regex_match(r"(?s)^tool harness\n?$", stdout);
        stdout.contains("tool harness"),
        "expected stdout to contain command output, got {stdout:?}"
    );
    Ok(())
 }
--- a/codex-rs/core/tests/suite/tools.rs
+++ b/codex-rs/core/tests/suite/tools.rs
@@ -9,6 +9,7 @@ use codex_core::protocol::InputItem;
 use codex_core::protocol::Op;
 use codex_core::protocol::SandboxPolicy;
 use codex_protocol::config_types::ReasoningSummary;
 use core_test_support::assert_regex_match;
 use core_test_support::responses::ev_assistant_message;
 use core_test_support::responses::ev_completed;
 use core_test_support::responses::ev_custom_tool_call;
@@ -21,6 +22,7 @@ use core_test_support::skip_if_no_network;
 use core_test_support::test_codex::TestCodex;
 use core_test_support::test_codex::test_codex;
 use core_test_support::wait_for_event;
 use regex_lite::Regex;
 use serde_json::Value;
 use serde_json::json;
 use wiremock::Request;
@@ -254,10 +256,8 @@ async fn shell_escalated_permissions_rejected_then_ok() -> Result<()> {
        "expected exit code 0 after rerunning without escalation",
    );
    let stdout = output_json["output"].as_str().unwrap_or_default();
-    assert!(
+    let stdout_pattern = r"(?s)^shell ok\n?$";
-        stdout.contains("shell ok"),
+    assert_regex_match(stdout_pattern, stdout);
        "expected stdout to include command output, got {stdout:?}"
    );
    Ok(())
 }
@@ -437,15 +437,15 @@ async fn shell_timeout_includes_timeout_prefix_and_metadata() -> Result<()> {
        );
        let stdout = output_json["output"].as_str().unwrap_or_default();
-        assert!(
+        let timeout_pattern = r"(?s)^Total output lines: \d+
-            stdout.contains("command timed out after "),
+
-            "expected timeout prefix, got {stdout:?}"
+command timed out after (?P<ms>\d+) milliseconds
-        );
+line
-        let third_line = stdout.lines().nth(2).unwrap_or_default();
+.*$";
-        let duration_ms = third_line
+        let captures = assert_regex_match(timeout_pattern, stdout);
-            .strip_prefix("command timed out after ")
+        let duration_ms = captures
-            .and_then(|line| line.strip_suffix(" milliseconds"))
+            .name("ms")
-            .and_then(|value| value.parse::<u64>().ok())
+            .and_then(|m| m.as_str().parse::<u64>().ok())
            .unwrap_or_default();
        assert!(
            duration_ms >= timeout_ms,
@@ -453,14 +453,8 @@ async fn shell_timeout_includes_timeout_prefix_and_metadata() -> Result<()> {
        );
    } else {
        // Fallback: accept the signal classification path to deflake the test.
-        assert!(
+        let signal_pattern = r"(?is)^execution error:.*signal.*$";
-            output_str.contains("execution error"),
+        assert_regex_match(signal_pattern, output_str);
            "unexpected non-JSON output: {output_str:?}"
        );
        assert!(
            output_str.contains("Signal(") || output_str.to_lowercase().contains("signal"),
            "expected signal classification in error output, got {output_str:?}"
        );
    }
    Ok(())
@@ -518,30 +512,25 @@ async fn shell_sandbox_denied_truncates_error_output() -> Result<()> {
        .and_then(Value::as_str)
        .expect("denied output string");
-    assert!(
+    let sandbox_pattern = r#"(?s)^Exit code: -?\d+
-        output.contains("failed in sandbox: "),
+Wall time: [0-9]+(?:\.[0-9]+)? seconds
-        "expected sandbox error prefix, got {output:?}"
+Total output lines: \d+
-    );
+Output:
-    assert!(
+Total output lines: \d+
-        output.contains("[... omitted"),
+
-        "expected truncated marker, got {output:?}"
+failed in sandbox: .*?(?:Operation not permitted|Permission denied|Read-only file system).*?
-    );
+\[\.{3} omitted \d+ of \d+ lines \.{3}\]
-    assert!(
+.*this is a long stderr line that should trigger truncation 0123456789abcdefghijklmnopqrstuvwxyz.*
-        output.contains(long_line),
+\n?$"#;
-        "expected truncated stderr sample, got {output:?}"
+    let sandbox_regex = Regex::new(sandbox_pattern)?;
-    );
+    if !sandbox_regex.is_match(output) {
-    // Linux distributions may surface sandbox write failures as different errno messages
+        let fallback_pattern = r#"(?s)^Total output lines: \d+
-    // depending on the underlying mechanism (e.g., EPERM, EACCES, or EROFS). Accept a
+
-    // small set of common variants to keep this cross-platform.
+failed in sandbox: this is a long stderr line that should trigger truncation 0123456789abcdefghijklmnopqrstuvwxyz
-    let denial_markers = [
+.*this is a long stderr line that should trigger truncation 0123456789abcdefghijklmnopqrstuvwxyz.*
-        "Operation not permitted", // EPERM
+.*(?:Operation not permitted|Permission denied|Read-only file system).*$"#;
-        "Permission denied",       // EACCES
+        assert_regex_match(fallback_pattern, output);
-        "Read-only file system",   // EROFS
+    }
    ];
    assert!(
        denial_markers.iter().any(|m| output.contains(m)),
        "expected sandbox denial message, got {output:?}"
    );
    Ok(())
 }
@@ -604,10 +593,23 @@ async fn shell_spawn_failure_truncates_exec_error() -> Result<()> {
        .and_then(Value::as_str)
        .expect("spawn failure output string");
-    assert!(
+    let spawn_error_pattern = r#"(?s)^Exit code: -?\d+
-        output.contains("execution error:"),
+Wall time: [0-9]+(?:\.[0-9]+)? seconds
-        "expected execution error prefix, got {output:?}"
+Output:
-    );
+execution error: .*$"#;
    let spawn_truncated_pattern = r#"(?s)^Exit code: -?\d+
 Wall time: [0-9]+(?:\.[0-9]+)? seconds
 Total output lines: \d+
 Output:
 Total output lines: \d+
 execution error: .*$"#;
    let spawn_error_regex = Regex::new(spawn_error_pattern)?;
    let spawn_truncated_regex = Regex::new(spawn_truncated_pattern)?;
    if !spawn_error_regex.is_match(output) && !spawn_truncated_regex.is_match(output) {
        let fallback_pattern = r"(?s)^execution error: .*$";
        assert_regex_match(fallback_pattern, output);
    }
    assert!(output.len() <= 10 * 1024);
    Ok(())