Make output assertions more explicit (#4784)

Match using precise regexes.
This commit is contained in:
pakrym-oai
2025-10-05 16:01:38 -07:00
committed by GitHub
parent 77a8b7fdeb
commit b2d81a7cac
6 changed files with 95 additions and 81 deletions

1
codex-rs/Cargo.lock generated
View File

@@ -1575,6 +1575,7 @@ dependencies = [
"anyhow", "anyhow",
"assert_cmd", "assert_cmd",
"codex-core", "codex-core",
"regex-lite",
"serde_json", "serde_json",
"tempfile", "tempfile",
"tokio", "tokio",

View File

@@ -10,6 +10,7 @@ path = "lib.rs"
anyhow = { workspace = true } anyhow = { workspace = true }
assert_cmd = { workspace = true } assert_cmd = { workspace = true }
codex-core = { workspace = true } codex-core = { workspace = true }
regex-lite = { workspace = true }
serde_json = { workspace = true } serde_json = { workspace = true }
tempfile = { workspace = true } tempfile = { workspace = true }
tokio = { workspace = true, features = ["time"] } tokio = { workspace = true, features = ["time"] }

View File

@@ -6,6 +6,7 @@ use codex_core::CodexConversation;
use codex_core::config::Config; use codex_core::config::Config;
use codex_core::config::ConfigOverrides; use codex_core::config::ConfigOverrides;
use codex_core::config::ConfigToml; use codex_core::config::ConfigToml;
use regex_lite::Regex;
#[cfg(target_os = "linux")] #[cfg(target_os = "linux")]
use assert_cmd::cargo::cargo_bin; use assert_cmd::cargo::cargo_bin;
@@ -14,6 +15,16 @@ pub mod responses;
pub mod test_codex; pub mod test_codex;
pub mod test_codex_exec; pub mod test_codex_exec;
#[track_caller]
pub fn assert_regex_match<'s>(pattern: &str, actual: &'s str) -> regex_lite::Captures<'s> {
let regex = Regex::new(pattern).unwrap_or_else(|err| {
panic!("failed to compile regex {pattern:?}: {err}");
});
regex
.captures(actual)
.unwrap_or_else(|| panic!("regex {pattern:?} did not match {actual:?}"))
}
/// Returns a default `Config` whose on-disk state is confined to the provided /// Returns a default `Config` whose on-disk state is confined to the provided
/// temporary directory. Using a per-test directory keeps tests hermetic and /// temporary directory. Using a per-test directory keeps tests hermetic and
/// avoids clobbering a developers real `~/.codex`. /// avoids clobbering a developers real `~/.codex`.

View File

@@ -8,6 +8,7 @@ use codex_core::protocol::InputItem;
use codex_core::protocol::Op; use codex_core::protocol::Op;
use codex_core::protocol::SandboxPolicy; use codex_core::protocol::SandboxPolicy;
use codex_protocol::config_types::ReasoningSummary; use codex_protocol::config_types::ReasoningSummary;
use core_test_support::assert_regex_match;
use core_test_support::responses::ev_assistant_message; use core_test_support::responses::ev_assistant_message;
use core_test_support::responses::ev_completed; use core_test_support::responses::ev_completed;
use core_test_support::responses::ev_function_call; use core_test_support::responses::ev_function_call;
@@ -131,10 +132,7 @@ async fn shell_output_stays_json_without_freeform_apply_patch() -> Result<()> {
.get("output") .get("output")
.and_then(Value::as_str) .and_then(Value::as_str)
.unwrap_or_default(); .unwrap_or_default();
assert!( assert_regex_match(r"(?s)^shell json\n?$", stdout);
stdout.contains("shell json"),
"expected stdout to include command output, got {stdout:?}"
);
Ok(()) Ok(())
} }
@@ -190,18 +188,12 @@ async fn shell_output_is_structured_with_freeform_apply_patch() -> Result<()> {
serde_json::from_str::<Value>(output).is_err(), serde_json::from_str::<Value>(output).is_err(),
"expected structured shell output to be plain text", "expected structured shell output to be plain text",
); );
assert!( let expected_pattern = r"(?s)^Exit code: 0
output.starts_with("Exit code: 0\n"), Wall time: [0-9]+(?:\.[0-9]+)? seconds
"expected exit code prefix, got {output:?}", Output:
); freeform shell
assert!( ?$";
output.contains("\nOutput:\n"), assert_regex_match(expected_pattern, output);
"expected Output section, got {output:?}"
);
assert!(
output.contains("freeform shell"),
"expected stdout content, got {output:?}"
);
Ok(()) Ok(())
} }
@@ -259,18 +251,27 @@ async fn shell_output_reserializes_truncated_content() -> Result<()> {
serde_json::from_str::<Value>(output).is_err(), serde_json::from_str::<Value>(output).is_err(),
"expected truncated shell output to be plain text", "expected truncated shell output to be plain text",
); );
assert!( let truncated_pattern = r#"(?s)^Exit code: 0
output.starts_with("Exit code: 0\n"), Wall time: [0-9]+(?:\.[0-9]+)? seconds
"expected exit code prefix, got {output:?}", Total output lines: 400
); Output:
assert!( Total output lines: 400
output.lines().any(|line| line == "Total output lines: 400"),
"expected total output lines marker, got {output:?}", 1
); 2
assert!( 3
output.contains("[... omitted"), 4
"expected truncated marker, got {output:?}", 5
); 6
.*\[\.{3} omitted \d+ of 400 lines \.{3}\]
.*\n396
397
398
399
400
$"#;
assert_regex_match(truncated_pattern, output);
Ok(()) Ok(())
} }

View File

@@ -9,6 +9,7 @@ use codex_core::protocol::Op;
use codex_core::protocol::SandboxPolicy; use codex_core::protocol::SandboxPolicy;
use codex_protocol::config_types::ReasoningSummary; use codex_protocol::config_types::ReasoningSummary;
use codex_protocol::plan_tool::StepStatus; use codex_protocol::plan_tool::StepStatus;
use core_test_support::assert_regex_match;
use core_test_support::responses; use core_test_support::responses;
use core_test_support::responses::ev_apply_patch_function_call; use core_test_support::responses::ev_apply_patch_function_call;
use core_test_support::responses::ev_assistant_message; use core_test_support::responses::ev_assistant_message;
@@ -116,10 +117,7 @@ async fn shell_tool_executes_command_and_streams_output() -> anyhow::Result<()>
let exec_output: Value = serde_json::from_str(output_text)?; let exec_output: Value = serde_json::from_str(output_text)?;
assert_eq!(exec_output["metadata"]["exit_code"], 0); assert_eq!(exec_output["metadata"]["exit_code"], 0);
let stdout = exec_output["output"].as_str().expect("stdout field"); let stdout = exec_output["output"].as_str().expect("stdout field");
assert!( assert_regex_match(r"(?s)^tool harness\n?$", stdout);
stdout.contains("tool harness"),
"expected stdout to contain command output, got {stdout:?}"
);
Ok(()) Ok(())
} }

View File

@@ -9,6 +9,7 @@ use codex_core::protocol::InputItem;
use codex_core::protocol::Op; use codex_core::protocol::Op;
use codex_core::protocol::SandboxPolicy; use codex_core::protocol::SandboxPolicy;
use codex_protocol::config_types::ReasoningSummary; use codex_protocol::config_types::ReasoningSummary;
use core_test_support::assert_regex_match;
use core_test_support::responses::ev_assistant_message; use core_test_support::responses::ev_assistant_message;
use core_test_support::responses::ev_completed; use core_test_support::responses::ev_completed;
use core_test_support::responses::ev_custom_tool_call; use core_test_support::responses::ev_custom_tool_call;
@@ -21,6 +22,7 @@ use core_test_support::skip_if_no_network;
use core_test_support::test_codex::TestCodex; use core_test_support::test_codex::TestCodex;
use core_test_support::test_codex::test_codex; use core_test_support::test_codex::test_codex;
use core_test_support::wait_for_event; use core_test_support::wait_for_event;
use regex_lite::Regex;
use serde_json::Value; use serde_json::Value;
use serde_json::json; use serde_json::json;
use wiremock::Request; use wiremock::Request;
@@ -254,10 +256,8 @@ async fn shell_escalated_permissions_rejected_then_ok() -> Result<()> {
"expected exit code 0 after rerunning without escalation", "expected exit code 0 after rerunning without escalation",
); );
let stdout = output_json["output"].as_str().unwrap_or_default(); let stdout = output_json["output"].as_str().unwrap_or_default();
assert!( let stdout_pattern = r"(?s)^shell ok\n?$";
stdout.contains("shell ok"), assert_regex_match(stdout_pattern, stdout);
"expected stdout to include command output, got {stdout:?}"
);
Ok(()) Ok(())
} }
@@ -437,15 +437,15 @@ async fn shell_timeout_includes_timeout_prefix_and_metadata() -> Result<()> {
); );
let stdout = output_json["output"].as_str().unwrap_or_default(); let stdout = output_json["output"].as_str().unwrap_or_default();
assert!( let timeout_pattern = r"(?s)^Total output lines: \d+
stdout.contains("command timed out after "),
"expected timeout prefix, got {stdout:?}" command timed out after (?P<ms>\d+) milliseconds
); line
let third_line = stdout.lines().nth(2).unwrap_or_default(); .*$";
let duration_ms = third_line let captures = assert_regex_match(timeout_pattern, stdout);
.strip_prefix("command timed out after ") let duration_ms = captures
.and_then(|line| line.strip_suffix(" milliseconds")) .name("ms")
.and_then(|value| value.parse::<u64>().ok()) .and_then(|m| m.as_str().parse::<u64>().ok())
.unwrap_or_default(); .unwrap_or_default();
assert!( assert!(
duration_ms >= timeout_ms, duration_ms >= timeout_ms,
@@ -453,14 +453,8 @@ async fn shell_timeout_includes_timeout_prefix_and_metadata() -> Result<()> {
); );
} else { } else {
// Fallback: accept the signal classification path to deflake the test. // Fallback: accept the signal classification path to deflake the test.
assert!( let signal_pattern = r"(?is)^execution error:.*signal.*$";
output_str.contains("execution error"), assert_regex_match(signal_pattern, output_str);
"unexpected non-JSON output: {output_str:?}"
);
assert!(
output_str.contains("Signal(") || output_str.to_lowercase().contains("signal"),
"expected signal classification in error output, got {output_str:?}"
);
} }
Ok(()) Ok(())
@@ -518,30 +512,25 @@ async fn shell_sandbox_denied_truncates_error_output() -> Result<()> {
.and_then(Value::as_str) .and_then(Value::as_str)
.expect("denied output string"); .expect("denied output string");
assert!( let sandbox_pattern = r#"(?s)^Exit code: -?\d+
output.contains("failed in sandbox: "), Wall time: [0-9]+(?:\.[0-9]+)? seconds
"expected sandbox error prefix, got {output:?}" Total output lines: \d+
); Output:
assert!( Total output lines: \d+
output.contains("[... omitted"),
"expected truncated marker, got {output:?}" failed in sandbox: .*?(?:Operation not permitted|Permission denied|Read-only file system).*?
); \[\.{3} omitted \d+ of \d+ lines \.{3}\]
assert!( .*this is a long stderr line that should trigger truncation 0123456789abcdefghijklmnopqrstuvwxyz.*
output.contains(long_line), \n?$"#;
"expected truncated stderr sample, got {output:?}" let sandbox_regex = Regex::new(sandbox_pattern)?;
); if !sandbox_regex.is_match(output) {
// Linux distributions may surface sandbox write failures as different errno messages let fallback_pattern = r#"(?s)^Total output lines: \d+
// depending on the underlying mechanism (e.g., EPERM, EACCES, or EROFS). Accept a
// small set of common variants to keep this cross-platform. failed in sandbox: this is a long stderr line that should trigger truncation 0123456789abcdefghijklmnopqrstuvwxyz
let denial_markers = [ .*this is a long stderr line that should trigger truncation 0123456789abcdefghijklmnopqrstuvwxyz.*
"Operation not permitted", // EPERM .*(?:Operation not permitted|Permission denied|Read-only file system).*$"#;
"Permission denied", // EACCES assert_regex_match(fallback_pattern, output);
"Read-only file system", // EROFS }
];
assert!(
denial_markers.iter().any(|m| output.contains(m)),
"expected sandbox denial message, got {output:?}"
);
Ok(()) Ok(())
} }
@@ -604,10 +593,23 @@ async fn shell_spawn_failure_truncates_exec_error() -> Result<()> {
.and_then(Value::as_str) .and_then(Value::as_str)
.expect("spawn failure output string"); .expect("spawn failure output string");
assert!( let spawn_error_pattern = r#"(?s)^Exit code: -?\d+
output.contains("execution error:"), Wall time: [0-9]+(?:\.[0-9]+)? seconds
"expected execution error prefix, got {output:?}" Output:
); execution error: .*$"#;
let spawn_truncated_pattern = r#"(?s)^Exit code: -?\d+
Wall time: [0-9]+(?:\.[0-9]+)? seconds
Total output lines: \d+
Output:
Total output lines: \d+
execution error: .*$"#;
let spawn_error_regex = Regex::new(spawn_error_pattern)?;
let spawn_truncated_regex = Regex::new(spawn_truncated_pattern)?;
if !spawn_error_regex.is_match(output) && !spawn_truncated_regex.is_match(output) {
let fallback_pattern = r"(?s)^execution error: .*$";
assert_regex_match(fallback_pattern, output);
}
assert!(output.len() <= 10 * 1024); assert!(output.len() <= 10 * 1024);
Ok(()) Ok(())