Make output assertions more explicit (#4784)

Match using precise regexes.
This commit is contained in:
pakrym-oai
2025-10-05 16:01:38 -07:00
committed by GitHub
parent 77a8b7fdeb
commit b2d81a7cac
6 changed files with 95 additions and 81 deletions

1
codex-rs/Cargo.lock generated
View File

@@ -1575,6 +1575,7 @@ dependencies = [
"anyhow",
"assert_cmd",
"codex-core",
"regex-lite",
"serde_json",
"tempfile",
"tokio",

View File

@@ -10,6 +10,7 @@ path = "lib.rs"
anyhow = { workspace = true }
assert_cmd = { workspace = true }
codex-core = { workspace = true }
regex-lite = { workspace = true }
serde_json = { workspace = true }
tempfile = { workspace = true }
tokio = { workspace = true, features = ["time"] }

View File

@@ -6,6 +6,7 @@ use codex_core::CodexConversation;
use codex_core::config::Config;
use codex_core::config::ConfigOverrides;
use codex_core::config::ConfigToml;
use regex_lite::Regex;
#[cfg(target_os = "linux")]
use assert_cmd::cargo::cargo_bin;
@@ -14,6 +15,16 @@ pub mod responses;
pub mod test_codex;
pub mod test_codex_exec;
/// Asserts that `actual` matches the regular expression `pattern` and returns
/// the resulting captures so callers can inspect individual groups.
///
/// The panics are raised directly in this function's body (not inside a
/// closure) so that `#[track_caller]` takes effect and a failure is reported
/// at the calling test's location; closures do not inherit `#[track_caller]`.
///
/// # Panics
///
/// Panics if `pattern` fails to compile, or if it does not match `actual`.
#[track_caller]
pub fn assert_regex_match<'s>(pattern: &str, actual: &'s str) -> regex_lite::Captures<'s> {
    let regex = match Regex::new(pattern) {
        Ok(regex) => regex,
        Err(err) => panic!("failed to compile regex {pattern:?}: {err}"),
    };
    match regex.captures(actual) {
        Some(captures) => captures,
        None => panic!("regex {pattern:?} did not match {actual:?}"),
    }
}
/// Returns a default `Config` whose on-disk state is confined to the provided
/// temporary directory. Using a per-test directory keeps tests hermetic and
/// avoids clobbering a developer's real `~/.codex`.

View File

@@ -8,6 +8,7 @@ use codex_core::protocol::InputItem;
use codex_core::protocol::Op;
use codex_core::protocol::SandboxPolicy;
use codex_protocol::config_types::ReasoningSummary;
use core_test_support::assert_regex_match;
use core_test_support::responses::ev_assistant_message;
use core_test_support::responses::ev_completed;
use core_test_support::responses::ev_function_call;
@@ -131,10 +132,7 @@ async fn shell_output_stays_json_without_freeform_apply_patch() -> Result<()> {
.get("output")
.and_then(Value::as_str)
.unwrap_or_default();
assert!(
stdout.contains("shell json"),
"expected stdout to include command output, got {stdout:?}"
);
assert_regex_match(r"(?s)^shell json\n?$", stdout);
Ok(())
}
@@ -190,18 +188,12 @@ async fn shell_output_is_structured_with_freeform_apply_patch() -> Result<()> {
serde_json::from_str::<Value>(output).is_err(),
"expected structured shell output to be plain text",
);
assert!(
output.starts_with("Exit code: 0\n"),
"expected exit code prefix, got {output:?}",
);
assert!(
output.contains("\nOutput:\n"),
"expected Output section, got {output:?}"
);
assert!(
output.contains("freeform shell"),
"expected stdout content, got {output:?}"
);
let expected_pattern = r"(?s)^Exit code: 0
Wall time: [0-9]+(?:\.[0-9]+)? seconds
Output:
freeform shell
?$";
assert_regex_match(expected_pattern, output);
Ok(())
}
@@ -259,18 +251,27 @@ async fn shell_output_reserializes_truncated_content() -> Result<()> {
serde_json::from_str::<Value>(output).is_err(),
"expected truncated shell output to be plain text",
);
assert!(
output.starts_with("Exit code: 0\n"),
"expected exit code prefix, got {output:?}",
);
assert!(
output.lines().any(|line| line == "Total output lines: 400"),
"expected total output lines marker, got {output:?}",
);
assert!(
output.contains("[... omitted"),
"expected truncated marker, got {output:?}",
);
let truncated_pattern = r#"(?s)^Exit code: 0
Wall time: [0-9]+(?:\.[0-9]+)? seconds
Total output lines: 400
Output:
Total output lines: 400
1
2
3
4
5
6
.*\[\.{3} omitted \d+ of 400 lines \.{3}\]
.*\n396
397
398
399
400
$"#;
assert_regex_match(truncated_pattern, output);
Ok(())
}

View File

@@ -9,6 +9,7 @@ use codex_core::protocol::Op;
use codex_core::protocol::SandboxPolicy;
use codex_protocol::config_types::ReasoningSummary;
use codex_protocol::plan_tool::StepStatus;
use core_test_support::assert_regex_match;
use core_test_support::responses;
use core_test_support::responses::ev_apply_patch_function_call;
use core_test_support::responses::ev_assistant_message;
@@ -116,10 +117,7 @@ async fn shell_tool_executes_command_and_streams_output() -> anyhow::Result<()>
let exec_output: Value = serde_json::from_str(output_text)?;
assert_eq!(exec_output["metadata"]["exit_code"], 0);
let stdout = exec_output["output"].as_str().expect("stdout field");
assert!(
stdout.contains("tool harness"),
"expected stdout to contain command output, got {stdout:?}"
);
assert_regex_match(r"(?s)^tool harness\n?$", stdout);
Ok(())
}

View File

@@ -9,6 +9,7 @@ use codex_core::protocol::InputItem;
use codex_core::protocol::Op;
use codex_core::protocol::SandboxPolicy;
use codex_protocol::config_types::ReasoningSummary;
use core_test_support::assert_regex_match;
use core_test_support::responses::ev_assistant_message;
use core_test_support::responses::ev_completed;
use core_test_support::responses::ev_custom_tool_call;
@@ -21,6 +22,7 @@ use core_test_support::skip_if_no_network;
use core_test_support::test_codex::TestCodex;
use core_test_support::test_codex::test_codex;
use core_test_support::wait_for_event;
use regex_lite::Regex;
use serde_json::Value;
use serde_json::json;
use wiremock::Request;
@@ -254,10 +256,8 @@ async fn shell_escalated_permissions_rejected_then_ok() -> Result<()> {
"expected exit code 0 after rerunning without escalation",
);
let stdout = output_json["output"].as_str().unwrap_or_default();
assert!(
stdout.contains("shell ok"),
"expected stdout to include command output, got {stdout:?}"
);
let stdout_pattern = r"(?s)^shell ok\n?$";
assert_regex_match(stdout_pattern, stdout);
Ok(())
}
@@ -437,15 +437,15 @@ async fn shell_timeout_includes_timeout_prefix_and_metadata() -> Result<()> {
);
let stdout = output_json["output"].as_str().unwrap_or_default();
assert!(
stdout.contains("command timed out after "),
"expected timeout prefix, got {stdout:?}"
);
let third_line = stdout.lines().nth(2).unwrap_or_default();
let duration_ms = third_line
.strip_prefix("command timed out after ")
.and_then(|line| line.strip_suffix(" milliseconds"))
.and_then(|value| value.parse::<u64>().ok())
let timeout_pattern = r"(?s)^Total output lines: \d+
command timed out after (?P<ms>\d+) milliseconds
line
.*$";
let captures = assert_regex_match(timeout_pattern, stdout);
let duration_ms = captures
.name("ms")
.and_then(|m| m.as_str().parse::<u64>().ok())
.unwrap_or_default();
assert!(
duration_ms >= timeout_ms,
@@ -453,14 +453,8 @@ async fn shell_timeout_includes_timeout_prefix_and_metadata() -> Result<()> {
);
} else {
// Fallback: accept the signal classification path to deflake the test.
assert!(
output_str.contains("execution error"),
"unexpected non-JSON output: {output_str:?}"
);
assert!(
output_str.contains("Signal(") || output_str.to_lowercase().contains("signal"),
"expected signal classification in error output, got {output_str:?}"
);
let signal_pattern = r"(?is)^execution error:.*signal.*$";
assert_regex_match(signal_pattern, output_str);
}
Ok(())
@@ -518,30 +512,25 @@ async fn shell_sandbox_denied_truncates_error_output() -> Result<()> {
.and_then(Value::as_str)
.expect("denied output string");
assert!(
output.contains("failed in sandbox: "),
"expected sandbox error prefix, got {output:?}"
);
assert!(
output.contains("[... omitted"),
"expected truncated marker, got {output:?}"
);
assert!(
output.contains(long_line),
"expected truncated stderr sample, got {output:?}"
);
// Linux distributions may surface sandbox write failures as different errno messages
// depending on the underlying mechanism (e.g., EPERM, EACCES, or EROFS). Accept a
// small set of common variants to keep this cross-platform.
let denial_markers = [
"Operation not permitted", // EPERM
"Permission denied", // EACCES
"Read-only file system", // EROFS
];
assert!(
denial_markers.iter().any(|m| output.contains(m)),
"expected sandbox denial message, got {output:?}"
);
let sandbox_pattern = r#"(?s)^Exit code: -?\d+
Wall time: [0-9]+(?:\.[0-9]+)? seconds
Total output lines: \d+
Output:
Total output lines: \d+
failed in sandbox: .*?(?:Operation not permitted|Permission denied|Read-only file system).*?
\[\.{3} omitted \d+ of \d+ lines \.{3}\]
.*this is a long stderr line that should trigger truncation 0123456789abcdefghijklmnopqrstuvwxyz.*
\n?$"#;
let sandbox_regex = Regex::new(sandbox_pattern)?;
if !sandbox_regex.is_match(output) {
let fallback_pattern = r#"(?s)^Total output lines: \d+
failed in sandbox: this is a long stderr line that should trigger truncation 0123456789abcdefghijklmnopqrstuvwxyz
.*this is a long stderr line that should trigger truncation 0123456789abcdefghijklmnopqrstuvwxyz.*
.*(?:Operation not permitted|Permission denied|Read-only file system).*$"#;
assert_regex_match(fallback_pattern, output);
}
Ok(())
}
@@ -604,10 +593,23 @@ async fn shell_spawn_failure_truncates_exec_error() -> Result<()> {
.and_then(Value::as_str)
.expect("spawn failure output string");
assert!(
output.contains("execution error:"),
"expected execution error prefix, got {output:?}"
);
let spawn_error_pattern = r#"(?s)^Exit code: -?\d+
Wall time: [0-9]+(?:\.[0-9]+)? seconds
Output:
execution error: .*$"#;
let spawn_truncated_pattern = r#"(?s)^Exit code: -?\d+
Wall time: [0-9]+(?:\.[0-9]+)? seconds
Total output lines: \d+
Output:
Total output lines: \d+
execution error: .*$"#;
let spawn_error_regex = Regex::new(spawn_error_pattern)?;
let spawn_truncated_regex = Regex::new(spawn_truncated_pattern)?;
if !spawn_error_regex.is_match(output) && !spawn_truncated_regex.is_match(output) {
let fallback_pattern = r"(?s)^execution error: .*$";
assert_regex_match(fallback_pattern, output);
}
assert!(output.len() <= 10 * 1024);
Ok(())