Phase 1: Repository & Infrastructure Setup

- Renamed directories: codex-rs -> llmx-rs, codex-cli -> llmx-cli
- Updated package.json files:
  - Root: llmx-monorepo
  - CLI: @llmx/llmx
  - SDK: @llmx/llmx-sdk
- Updated pnpm workspace configuration
- Renamed binary: codex.js -> llmx.js
- Updated environment variables: CODEX_* -> LLMX_*
- Changed repository URLs to valknar/llmx

🤖 Generated with Claude Code
2025-11-11 14:01:52 +01:00
parent 052b052832
commit f237fe560d
1151 changed files with 41 additions and 35 deletions
--- a/llmx-rs/core/tests/common/Cargo.toml
+++ b/llmx-rs/core/tests/common/Cargo.toml
@@ -0,0 +1,20 @@
+[package]
+edition = "2024"
+name = "core_test_support"
+version = { workspace = true }
+
+[lib]
+path = "lib.rs"
+
+[dependencies]
+anyhow = { workspace = true }
+assert_cmd = { workspace = true }
+codex-core = { workspace = true }
+codex-protocol = { workspace = true }
+notify = { workspace = true }
+regex-lite = { workspace = true }
+serde_json = { workspace = true }
+tempfile = { workspace = true }
+tokio = { workspace = true, features = ["time"] }
+walkdir = { workspace = true }
+wiremock = { workspace = true }
--- a/llmx-rs/core/tests/common/lib.rs
+++ b/llmx-rs/core/tests/common/lib.rs
@@ -0,0 +1,362 @@
+#![expect(clippy::expect_used)]
+
+use tempfile::TempDir;
+
+use codex_core::CodexConversation;
+use codex_core::config::Config;
+use codex_core::config::ConfigOverrides;
+use codex_core::config::ConfigToml;
+use regex_lite::Regex;
+
+#[cfg(target_os = "linux")]
+use assert_cmd::cargo::cargo_bin;
+
+pub mod responses;
+pub mod test_codex;
+pub mod test_codex_exec;
+
+/// Asserts that `pattern` matches somewhere in `actual` and returns the
+/// resulting capture groups.
+///
+/// # Panics
+///
+/// Panics if `pattern` fails to compile or does not match `actual`. Both
+/// panics are raised directly in this `#[track_caller]` function rather than
+/// inside a closure, so the reported location is the calling test.
+#[track_caller]
+pub fn assert_regex_match<'s>(pattern: &str, actual: &'s str) -> regex_lite::Captures<'s> {
+    let regex = match Regex::new(pattern) {
+        Ok(regex) => regex,
+        Err(err) => panic!("failed to compile regex {pattern:?}: {err}"),
+    };
+    match regex.captures(actual) {
+        Some(captures) => captures,
+        None => panic!("regex {pattern:?} did not match {actual:?}"),
+    }
+}
+
+/// Builds a `Config` from `ConfigToml::default()` plus the platform-specific
+/// test overrides, rooted at the given temporary directory.
+///
+/// Keeping all on-disk state inside a per-test directory keeps tests hermetic
+/// and avoids clobbering a developer’s real `~/.codex`.
+///
+/// # Panics
+///
+/// Panics if the default configuration cannot be loaded.
+pub fn load_default_config_for_test(codex_home: &TempDir) -> Config {
+    let base = ConfigToml::default();
+    let overrides = default_test_overrides();
+    let home_dir = codex_home.path().to_path_buf();
+    Config::load_from_base_config_with_overrides(base, overrides, home_dir)
+        .expect("defaults for test should always succeed")
+}
+
+/// Linux test overrides: points the sandbox executable at the
+/// `codex-linux-sandbox` binary resolved by `cargo_bin`; every other field
+/// keeps its default.
+#[cfg(target_os = "linux")]
+fn default_test_overrides() -> ConfigOverrides {
+    let sandbox_exe = cargo_bin("codex-linux-sandbox");
+    ConfigOverrides {
+        codex_linux_sandbox_exe: Some(sandbox_exe),
+        ..Default::default()
+    }
+}
+
+/// Non-Linux test overrides: no sandbox executable is configured, so the
+/// defaults are used unchanged.
+#[cfg(not(target_os = "linux"))]
+fn default_test_overrides() -> ConfigOverrides {
+    Default::default()
+}
+
+/// Builds an SSE stream body from a JSON fixture.
+///
+/// The fixture must contain an array of objects where each object represents a
+/// single SSE event with at least a `type` field matching the `event:` value.
+/// Events with further fields are emitted with the whole object (including
+/// `type`) serialized on the `data:` line. An object with only a `type` field
+/// results in an event with no `data:` section. This makes it trivial to
+/// extend the fixtures as OpenAI adds new event kinds or fields.
+///
+/// # Panics
+///
+/// Panics if the file cannot be opened, is not a JSON array, or contains an
+/// event without a string `type` field.
+pub fn load_sse_fixture(path: impl AsRef<std::path::Path>) -> String {
+    let file = std::fs::File::open(path).expect("read fixture");
+    // `serde_json::from_reader` performs many small reads; buffer the file so
+    // each one does not turn into a syscall.
+    let events: Vec<serde_json::Value> =
+        serde_json::from_reader(std::io::BufReader::new(file)).expect("parse JSON fixture");
+    events
+        .into_iter()
+        .map(|event| {
+            let kind = event
+                .get("type")
+                .and_then(|v| v.as_str())
+                .expect("fixture event missing type");
+            // An object whose only key is `type` carries no payload.
+            let type_only = event.as_object().is_some_and(|fields| fields.len() == 1);
+            if type_only {
+                format!("event: {kind}\n\n")
+            } else {
+                format!("event: {kind}\ndata: {event}\n\n")
+            }
+        })
+        .collect()
+}
+
+/// Renders an in-memory JSON fixture template as an SSE stream body.
+///
+/// Every occurrence of the placeholder `__ID__` in `raw` is replaced with `id`
+/// before the text is parsed as a JSON array of event objects. Each event
+/// becomes `event: <type>` followed by a `data:` line holding the whole
+/// object, unless the object has `type` as its only key, in which case the
+/// `data:` line is omitted.
+///
+/// # Panics
+///
+/// Panics if the substituted text is not a JSON array or an event lacks a
+/// string `type` field.
+pub fn load_sse_fixture_with_id_from_str(raw: &str, id: &str) -> String {
+    let substituted = raw.replace("__ID__", id);
+    let events: Vec<serde_json::Value> =
+        serde_json::from_str(&substituted).expect("parse JSON fixture");
+
+    let mut body = String::new();
+    for event in events {
+        let kind = event
+            .get("type")
+            .and_then(serde_json::Value::as_str)
+            .expect("fixture event missing type");
+        let type_only = matches!(event.as_object(), Some(fields) if fields.len() == 1);
+        let rendered = if type_only {
+            format!("event: {kind}\n\n")
+        } else {
+            format!("event: {kind}\ndata: {event}\n\n")
+        };
+        body.push_str(&rendered);
+    }
+    body
+}
+
+/// Same as [`load_sse_fixture`], but replaces the placeholder `__ID__` in the
+/// fixture template with the supplied identifier before parsing. This lets a
+/// single JSON template be reused by multiple tests that each need a unique
+/// `response_id`.
+///
+/// The substitution and rendering are delegated to
+/// [`load_sse_fixture_with_id_from_str`] so the file-based and in-memory
+/// variants cannot drift apart.
+///
+/// # Panics
+///
+/// Panics if the file cannot be read as UTF-8 text, or if the substituted
+/// template is not a valid fixture.
+pub fn load_sse_fixture_with_id(path: impl AsRef<std::path::Path>, id: &str) -> String {
+    let raw = std::fs::read_to_string(path).expect("read fixture template");
+    load_sse_fixture_with_id_from_str(&raw, id)
+}
+
+/// Waits for the next event accepted by `predicate` and returns its message.
+///
+/// Delegates to [`wait_for_event_with_timeout`] with a requested per-event
+/// wait of one second; that function raises any wait shorter than five
+/// seconds to five seconds.
+pub async fn wait_for_event<F>(
+    codex: &CodexConversation,
+    predicate: F,
+) -> codex_core::protocol::EventMsg
+where
+    F: FnMut(&codex_core::protocol::EventMsg) -> bool,
+{
+    let requested_wait = tokio::time::Duration::from_secs(1);
+    wait_for_event_with_timeout(codex, predicate, requested_wait).await
+}
+
+/// Waits for the first event for which `matcher` yields `Some` and returns
+/// the extracted value.
+///
+/// `matcher` is invoked once per candidate event while waiting and once more
+/// on the accepted event to produce the result, so it should be
+/// deterministic.
+pub async fn wait_for_event_match<T, F>(codex: &CodexConversation, matcher: F) -> T
+where
+    F: Fn(&codex_core::protocol::EventMsg) -> Option<T>,
+{
+    let accepted = wait_for_event(codex, |candidate| matcher(candidate).is_some()).await;
+    // The same matcher just returned `Some` for this event.
+    matcher(&accepted).unwrap()
+}
+
+/// Pulls events from `codex` until one satisfies `predicate`, then returns
+/// that event's message. Non-matching events are discarded.
+///
+/// The timeout applies to each individual `next_event` call, not to the whole
+/// wait, and is never shorter than five seconds regardless of `wait_time`.
+///
+/// # Panics
+///
+/// Panics if no event arrives within the per-event timeout or if
+/// `next_event` returns an error.
+pub async fn wait_for_event_with_timeout<F>(
+    codex: &CodexConversation,
+    mut predicate: F,
+    wait_time: tokio::time::Duration,
+) -> codex_core::protocol::EventMsg
+where
+    F: FnMut(&codex_core::protocol::EventMsg) -> bool,
+{
+    // Allow a bit more time to accommodate async startup work (e.g. config IO, tool discovery)
+    let per_event_wait = std::cmp::max(wait_time, tokio::time::Duration::from_secs(5));
+    loop {
+        let event = tokio::time::timeout(per_event_wait, codex.next_event())
+            .await
+            .expect("timeout waiting for event")
+            .expect("stream ended unexpectedly");
+        if !predicate(&event.msg) {
+            continue;
+        }
+        return event.msg;
+    }
+}
+
+/// Name of the environment variable exposed by `codex_core::spawn` as
+/// `CODEX_SANDBOX_ENV_VAR`. Used by `skip_if_sandbox!`.
+pub fn sandbox_env_var() -> &'static str {
+    use codex_core::spawn::CODEX_SANDBOX_ENV_VAR;
+    CODEX_SANDBOX_ENV_VAR
+}
+
+/// Name of the environment variable exposed by `codex_core::spawn` as
+/// `CODEX_SANDBOX_NETWORK_DISABLED_ENV_VAR`. Used by `skip_if_no_network!`.
+pub fn sandbox_network_env_var() -> &'static str {
+    use codex_core::spawn::CODEX_SANDBOX_NETWORK_DISABLED_ENV_VAR;
+    CODEX_SANDBOX_NETWORK_DISABLED_ENV_VAR
+}
+
+pub mod fs_wait {
+    use anyhow::Result;
+    use anyhow::anyhow;
+    use notify::RecursiveMode;
+    use notify::Watcher;
+    use std::path::Path;
+    use std::path::PathBuf;
+    use std::sync::mpsc;
+    use std::sync::mpsc::RecvTimeoutError;
+    use std::time::Duration;
+    use std::time::Instant;
+    use tokio::task;
+    use walkdir::WalkDir;
+
+    /// Asynchronously waits until `path` exists, or fails once `timeout` has
+    /// elapsed.
+    ///
+    /// The wait itself is blocking, so it runs on the blocking thread pool.
+    /// Returns the path on success; errors from the watcher or from joining
+    /// the blocking task are propagated.
+    pub async fn wait_for_path_exists(
+        path: impl Into<PathBuf>,
+        timeout: Duration,
+    ) -> Result<PathBuf> {
+        let target: PathBuf = path.into();
+        let outcome =
+            task::spawn_blocking(move || wait_for_path_exists_blocking(target, timeout)).await;
+        outcome?
+    }
+
+    /// Asynchronously waits until some regular file under `root` satisfies
+    /// `predicate` and returns its path.
+    ///
+    /// The search is blocking, so it runs on the blocking thread pool; errors
+    /// from the search or from joining the blocking task are propagated.
+    pub async fn wait_for_matching_file(
+        root: impl Into<PathBuf>,
+        timeout: Duration,
+        predicate: impl FnMut(&Path) -> bool + Send + 'static,
+    ) -> Result<PathBuf> {
+        let search_root: PathBuf = root.into();
+        let mut predicate = predicate;
+        let worker = task::spawn_blocking(move || {
+            blocking_find_matching_file(search_root, timeout, &mut predicate)
+        });
+        worker.await?
+    }
+
+    /// Blocks until `path` exists or `timeout` elapses.
+    ///
+    /// Returns immediately if the path is already present. Otherwise it
+    /// recursively watches the nearest existing ancestor and re-checks the
+    /// path whenever a filesystem event arrives. Watcher errors are returned
+    /// as-is; running out of time yields a "timed out" error unless the path
+    /// appeared in the meantime.
+    fn wait_for_path_exists_blocking(path: PathBuf, timeout: Duration) -> Result<PathBuf> {
+        if path.exists() {
+            return Ok(path);
+        }
+
+        // The target does not exist yet, so watch the closest directory that does.
+        let watch_root = nearest_existing_ancestor(&path);
+        let (tx, rx) = mpsc::channel();
+        let mut watcher = notify::recommended_watcher(move |res| {
+            let _ = tx.send(res);
+        })?;
+        watcher.watch(&watch_root, RecursiveMode::Recursive)?;
+
+        let deadline = Instant::now() + timeout;
+        loop {
+            // Checked on every iteration: catches creation before the watcher
+            // was registered as well as creation signalled by an event.
+            if path.exists() {
+                return Ok(path);
+            }
+            let remaining = deadline.saturating_duration_since(Instant::now());
+            if remaining.is_zero() {
+                break;
+            }
+            match rx.recv_timeout(remaining) {
+                Ok(Ok(_event)) => continue,
+                Ok(Err(err)) => return Err(err.into()),
+                Err(RecvTimeoutError::Timeout | RecvTimeoutError::Disconnected) => break,
+            }
+        }
+
+        // One last look in case the path appeared right at the deadline.
+        if !path.exists() {
+            return Err(anyhow!("timed out waiting for {path:?}"));
+        }
+        Ok(path)
+    }
+
+    /// Blocks until a regular file under `root` satisfies `predicate`.
+    ///
+    /// First waits (up to `timeout`) for `root` itself to exist, then searches
+    /// it, re-scanning on every filesystem event for up to a further
+    /// `timeout`. The tree is scanned again immediately after the watcher is
+    /// registered, so a file created between the first scan and the
+    /// registration is found at once instead of only by the final
+    /// post-deadline scan.
+    ///
+    /// # Errors
+    ///
+    /// Returns watcher errors as-is, and a "timed out" error if no matching
+    /// file exists by the deadline.
+    fn blocking_find_matching_file(
+        root: PathBuf,
+        timeout: Duration,
+        predicate: &mut impl FnMut(&Path) -> bool,
+    ) -> Result<PathBuf> {
+        let root = wait_for_path_exists_blocking(root, timeout)?;
+
+        // Fast path: no watcher is needed when a match already exists.
+        if let Some(found) = scan_for_match(&root, predicate) {
+            return Ok(found);
+        }
+
+        let (tx, rx) = mpsc::channel();
+        let mut watcher = notify::recommended_watcher(move |res| {
+            let _ = tx.send(res);
+        })?;
+        watcher.watch(&root, RecursiveMode::Recursive)?;
+
+        let deadline = Instant::now() + timeout;
+
+        loop {
+            // Scanning at the top of the loop covers both files created before
+            // the watch was registered and files announced by an event.
+            if let Some(found) = scan_for_match(&root, predicate) {
+                return Ok(found);
+            }
+            let remaining = deadline.saturating_duration_since(Instant::now());
+            if remaining.is_zero() {
+                break;
+            }
+            match rx.recv_timeout(remaining) {
+                Ok(Ok(_event)) => continue,
+                Ok(Err(err)) => return Err(err.into()),
+                Err(RecvTimeoutError::Timeout) => break,
+                Err(RecvTimeoutError::Disconnected) => break,
+            }
+        }
+
+        // Final scan in case a match landed right at the deadline.
+        if let Some(found) = scan_for_match(&root, predicate) {
+            Ok(found)
+        } else {
+            Err(anyhow!("timed out waiting for matching file in {root:?}"))
+        }
+    }
+
+    /// Walks `root` recursively and returns the first regular file accepted by
+    /// `predicate`, if any. Entries that cannot be read are skipped, and the
+    /// predicate is only called for regular files.
+    fn scan_for_match(root: &Path, predicate: &mut impl FnMut(&Path) -> bool) -> Option<PathBuf> {
+        WalkDir::new(root)
+            .into_iter()
+            .filter_map(Result::ok)
+            .filter(|entry| entry.file_type().is_file())
+            .find(|entry| predicate(entry.path()))
+            .map(|entry| entry.path().to_path_buf())
+    }
+
+    /// Returns `path` itself if it exists, otherwise the closest ancestor that
+    /// does. Falls back to `"."` when no ancestor exists.
+    fn nearest_existing_ancestor(path: &Path) -> PathBuf {
+        // `ancestors` yields `path` first, then each successive parent.
+        path.ancestors()
+            .find(|candidate| candidate.exists())
+            .map(Path::to_path_buf)
+            .unwrap_or_else(|| PathBuf::from("."))
+    }
+}
+
+/// Returns early from the enclosing function when the environment variable
+/// named by `sandbox_env_var()` is set to `seatbelt`, after logging a notice
+/// to stderr.
+///
+/// With no arguments the enclosing function returns `()`; with one expression
+/// argument it returns that value. The expression is evaluated only when the
+/// test is actually skipped.
+#[macro_export]
+macro_rules! skip_if_sandbox {
+    () => {
+        $crate::skip_if_sandbox!(())
+    };
+    ($return_value:expr $(,)?) => {{
+        let sandbox_var = $crate::sandbox_env_var();
+        if ::std::env::var(sandbox_var).as_deref() == ::core::result::Result::Ok("seatbelt") {
+            eprintln!("{} is set to 'seatbelt', skipping test.", sandbox_var);
+            return $return_value;
+        }
+    }};
+}
+
+/// Returns early from the enclosing function when the environment variable
+/// named by `sandbox_network_env_var()` is set to any value, after printing a
+/// notice to stdout.
+///
+/// With no arguments the enclosing function returns `()`; with one expression
+/// argument it returns that value. The expression is evaluated only when the
+/// test is actually skipped.
+#[macro_export]
+macro_rules! skip_if_no_network {
+    () => {
+        $crate::skip_if_no_network!(())
+    };
+    ($return_value:expr $(,)?) => {{
+        let network_disabled = ::std::env::var($crate::sandbox_network_env_var()).is_ok();
+        if network_disabled {
+            println!(
+                "Skipping test because it cannot execute when network is disabled in a Codex sandbox."
+            );
+            return $return_value;
+        }
+    }};
+}
--- a/llmx-rs/core/tests/common/responses.rs
+++ b/llmx-rs/core/tests/common/responses.rs
@@ -0,0 +1,587 @@
+use std::sync::Arc;
+use std::sync::Mutex;
+
+use serde_json::Value;
+use wiremock::BodyPrintLimit;
+use wiremock::Match;
+use wiremock::Mock;
+use wiremock::MockBuilder;
+use wiremock::MockServer;
+use wiremock::Respond;
+use wiremock::ResponseTemplate;
+use wiremock::matchers::method;
+use wiremock::matchers::path_regex;
+
+/// Shared, thread-safe log of the requests seen by a mounted mock.
+///
+/// Clones share the same underlying list, so the handle returned to a test
+/// observes requests recorded through the clone installed as a matcher.
+#[derive(Debug, Clone)]
+pub struct ResponseMock {
+    requests: Arc<Mutex<Vec<ResponsesRequest>>>,
+}
+
+impl ResponseMock {
+    /// Creates a mock with an empty request log.
+    fn new() -> Self {
+        Self {
+            requests: Arc::default(),
+        }
+    }
+
+    /// Returns the only captured request.
+    ///
+    /// # Panics
+    ///
+    /// Panics unless exactly one request has been captured.
+    pub fn single_request(&self) -> ResponsesRequest {
+        let captured = self.requests.lock().unwrap();
+        match captured.as_slice() {
+            [only] => only.clone(),
+            other => panic!("expected 1 request, got {}", other.len()),
+        }
+    }
+
+    /// Returns a snapshot (clone) of every request captured so far.
+    pub fn requests(&self) -> Vec<ResponsesRequest> {
+        let captured = self.requests.lock().unwrap();
+        captured.to_vec()
+    }
+
+    /// Returns true if any captured request contains a `function_call` with the
+    /// provided `call_id`.
+    pub fn saw_function_call(&self, call_id: &str) -> bool {
+        for request in self.requests() {
+            if request.has_function_call(call_id) {
+                return true;
+            }
+        }
+        false
+    }
+
+    /// Returns the `output` string for a matching `function_call_output` with
+    /// the provided `call_id`, searching across all captured requests.
+    pub fn function_call_output_text(&self, call_id: &str) -> Option<String> {
+        for request in self.requests() {
+            if let Some(text) = request.function_call_output_text(call_id) {
+                return Some(text);
+            }
+        }
+        None
+    }
+}
+
+/// A single HTTP request captured by [`ResponseMock`], with helpers for
+/// inspecting its JSON body, headers and URL.
+#[derive(Debug, Clone)]
+pub struct ResponsesRequest(wiremock::Request);
+
+impl ResponsesRequest {
+    /// Parses the request body as JSON.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the body is not valid JSON.
+    pub fn body_json(&self) -> Value {
+        self.0.body_json().unwrap()
+    }
+
+    /// Returns all `input_text` spans from `message` inputs for the provided role.
+    pub fn message_input_texts(&self, role: &str) -> Vec<String> {
+        self.inputs_of_type("message")
+            .into_iter()
+            .filter(|item| item.get("role").and_then(Value::as_str) == Some(role))
+            .filter_map(|item| item.get("content").and_then(Value::as_array).cloned())
+            .flatten()
+            .filter(|span| span.get("type").and_then(Value::as_str) == Some("input_text"))
+            .filter_map(|span| span.get("text").and_then(Value::as_str).map(str::to_owned))
+            .collect()
+    }
+
+    /// Returns a copy of the top-level `input` array of the request body.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the body is not valid JSON or has no `input` array.
+    pub fn input(&self) -> Vec<Value> {
+        self.0.body_json::<Value>().unwrap()["input"]
+            .as_array()
+            .expect("input array not found in request")
+            .clone()
+    }
+
+    /// Returns the `input` items whose `type` field equals `ty`.
+    pub fn inputs_of_type(&self, ty: &str) -> Vec<Value> {
+        self.input()
+            .iter()
+            .filter(|item| item.get("type").and_then(Value::as_str) == Some(ty))
+            .cloned()
+            .collect()
+    }
+
+    /// Returns the `function_call_output` item for `call_id`; panics if absent.
+    pub fn function_call_output(&self, call_id: &str) -> Value {
+        self.call_output(call_id, "function_call_output")
+    }
+
+    /// Returns the `custom_tool_call_output` item for `call_id`; panics if absent.
+    pub fn custom_tool_call_output(&self, call_id: &str) -> Value {
+        self.call_output(call_id, "custom_tool_call_output")
+    }
+
+    /// Returns the first `input` item whose `type` is `call_type` and whose
+    /// `call_id` is `call_id`.
+    ///
+    /// Items lacking a string `type` or `call_id` are skipped rather than
+    /// triggering an opaque `unwrap` panic, matching how
+    /// [`Self::has_function_call`] inspects items.
+    ///
+    /// # Panics
+    ///
+    /// Panics if no such item exists.
+    pub fn call_output(&self, call_id: &str, call_type: &str) -> Value {
+        self.input()
+            .into_iter()
+            .find(|item| {
+                item.get("type").and_then(Value::as_str) == Some(call_type)
+                    && item.get("call_id").and_then(Value::as_str) == Some(call_id)
+            })
+            .unwrap_or_else(|| panic!("function call output {call_id} item not found in request"))
+    }
+
+    /// Returns true if this request's `input` contains a `function_call` with
+    /// the specified `call_id`.
+    pub fn has_function_call(&self, call_id: &str) -> bool {
+        self.input().iter().any(|item| {
+            item.get("type").and_then(Value::as_str) == Some("function_call")
+                && item.get("call_id").and_then(Value::as_str) == Some(call_id)
+        })
+    }
+
+    /// If present, returns the `output` string of the `function_call_output`
+    /// entry matching `call_id` in this request's `input`.
+    ///
+    /// Returns `None` when there is no such entry or its `output` is not a
+    /// JSON string.
+    pub fn function_call_output_text(&self, call_id: &str) -> Option<String> {
+        let binding = self.input();
+        let item = binding.iter().find(|item| {
+            item.get("type").and_then(Value::as_str) == Some("function_call_output")
+                && item.get("call_id").and_then(Value::as_str) == Some(call_id)
+        })?;
+        item.get("output")
+            .and_then(Value::as_str)
+            .map(str::to_string)
+    }
+
+    /// Returns the value of header `name`, if present and convertible to a
+    /// string.
+    pub fn header(&self, name: &str) -> Option<String> {
+        self.0
+            .headers
+            .get(name)
+            .and_then(|v| v.to_str().ok())
+            .map(str::to_string)
+    }
+
+    /// Returns the path component of the request URL.
+    pub fn path(&self) -> String {
+        self.0.url.path().to_string()
+    }
+
+    /// Returns the value of the first query parameter named `name`, if any.
+    pub fn query_param(&self, name: &str) -> Option<String> {
+        self.0
+            .url
+            .query_pairs()
+            .find(|(k, _)| k == name)
+            .map(|(_, v)| v.to_string())
+    }
+}
+
+impl Match for ResponseMock {
+    /// Records `request` and validates its body; always reports a match.
+    ///
+    /// This matcher exists for its side effects: the request is appended to
+    /// the shared log first, then checked against the call/output invariants.
+    fn matches(&self, request: &wiremock::Request) -> bool {
+        let captured = ResponsesRequest(request.clone());
+        self.requests.lock().unwrap().push(captured);
+
+        // Enforce invariant checks on every request body captured by the mock.
+        // Panic on orphan tool outputs or calls to catch regressions early.
+        validate_request_body_invariants(request);
+        true
+    }
+}
+
+/// Build an SSE stream body from a list of JSON events.
+///
+/// Each event is written as `event: <type>`; unless the event is an object
+/// whose only key is `type`, the whole event is also serialized on a `data:`
+/// line. Every event ends with a blank line.
+///
+/// # Panics
+///
+/// Panics if an event has no string `type` field.
+pub fn sse(events: Vec<Value>) -> String {
+    use std::fmt::Write as _;
+    events.iter().fold(String::new(), |mut body, event| {
+        let kind = event.get("type").and_then(Value::as_str).unwrap();
+        let type_only = matches!(event.as_object(), Some(fields) if fields.len() == 1);
+        if type_only {
+            write!(body, "event: {kind}\n\n").unwrap();
+        } else {
+            write!(body, "event: {kind}\ndata: {event}\n\n").unwrap();
+        }
+        body
+    })
+}
+
+/// Convenience: SSE event for a completed response with a specific id.
+///
+/// All usage counters are zero and both `*_details` fields are `null`, which
+/// is exactly what [`ev_completed_with_tokens`] produces for a total of 0.
+pub fn ev_completed(id: &str) -> Value {
+    ev_completed_with_tokens(id, 0)
+}
+
+/// Convenience: SSE event for a created response with a specific id.
+pub fn ev_response_created(id: &str) -> Value {
+    let response = serde_json::json!({ "id": id });
+    serde_json::json!({
+        "type": "response.created",
+        "response": response,
+    })
+}
+
+/// SSE event for a completed response whose usage reports `total_tokens` as
+/// both the input and the total token count; output tokens are zero and the
+/// `*_details` fields are `null`.
+pub fn ev_completed_with_tokens(id: &str, total_tokens: i64) -> Value {
+    let usage = serde_json::json!({
+        "input_tokens": total_tokens,
+        "input_tokens_details": null,
+        "output_tokens": 0,
+        "output_tokens_details": null,
+        "total_tokens": total_tokens
+    });
+    serde_json::json!({
+        "type": "response.completed",
+        "response": {
+            "id": id,
+            "usage": usage
+        }
+    })
+}
+
+/// Convenience: SSE event for a single assistant message output item.
+///
+/// Emitted as `response.output_item.done` with one `output_text` span.
+pub fn ev_assistant_message(id: &str, text: &str) -> Value {
+    let item = serde_json::json!({
+        "type": "message",
+        "role": "assistant",
+        "id": id,
+        "content": [{"type": "output_text", "text": text}]
+    });
+    serde_json::json!({
+        "type": "response.output_item.done",
+        "item": item
+    })
+}
+
+/// SSE event announcing a new assistant message item
+/// (`response.output_item.added`) with one `output_text` span.
+pub fn ev_message_item_added(id: &str, text: &str) -> Value {
+    let item = serde_json::json!({
+        "type": "message",
+        "role": "assistant",
+        "id": id,
+        "content": [{"type": "output_text", "text": text}]
+    });
+    serde_json::json!({
+        "type": "response.output_item.added",
+        "item": item
+    })
+}
+
+/// SSE event carrying an incremental chunk of output text
+/// (`response.output_text.delta`).
+pub fn ev_output_text_delta(delta: &str) -> Value {
+    use serde_json::json;
+    json!({ "type": "response.output_text.delta", "delta": delta })
+}
+
+/// SSE event for a finished reasoning item (`response.output_item.done`).
+///
+/// Each entry of `summary` becomes a `summary_text` span. When `raw_content`
+/// is non-empty, its entries are attached as `reasoning_text` spans under
+/// `content`; otherwise the `content` key is omitted entirely.
+pub fn ev_reasoning_item(id: &str, summary: &[&str], raw_content: &[&str]) -> Value {
+    let mut summary_spans = Vec::with_capacity(summary.len());
+    for text in summary {
+        summary_spans.push(serde_json::json!({"type": "summary_text", "text": text}));
+    }
+
+    let mut item = serde_json::json!({
+        "type": "reasoning",
+        "id": id,
+        "summary": summary_spans,
+    });
+
+    if !raw_content.is_empty() {
+        let mut content_spans = Vec::with_capacity(raw_content.len());
+        for text in raw_content {
+            content_spans.push(serde_json::json!({"type": "reasoning_text", "text": text}));
+        }
+        item["content"] = Value::Array(content_spans);
+    }
+
+    serde_json::json!({
+        "type": "response.output_item.done",
+        "item": item,
+    })
+}
+
+/// SSE event announcing a new reasoning item (`response.output_item.added`)
+/// whose `summary` holds one `summary_text` span per entry.
+pub fn ev_reasoning_item_added(id: &str, summary: &[&str]) -> Value {
+    let mut summary_spans = Vec::with_capacity(summary.len());
+    for text in summary {
+        summary_spans.push(serde_json::json!({"type": "summary_text", "text": text}));
+    }
+
+    let item = serde_json::json!({
+        "type": "reasoning",
+        "id": id,
+        "summary": summary_spans,
+    });
+    serde_json::json!({
+        "type": "response.output_item.added",
+        "item": item,
+    })
+}
+
+/// SSE event carrying an incremental chunk of reasoning summary text
+/// (`response.reasoning_summary_text.delta`).
+pub fn ev_reasoning_summary_text_delta(delta: &str) -> Value {
+    use serde_json::json;
+    json!({ "type": "response.reasoning_summary_text.delta", "delta": delta })
+}
+
+/// SSE event carrying an incremental chunk of raw reasoning text
+/// (`response.reasoning_text.delta`).
+pub fn ev_reasoning_text_delta(delta: &str) -> Value {
+    use serde_json::json;
+    json!({ "type": "response.reasoning_text.delta", "delta": delta })
+}
+
+/// SSE event announcing a new `web_search_call` item
+/// (`response.output_item.added`) with a `search` action for `query`.
+pub fn ev_web_search_call_added(id: &str, status: &str, query: &str) -> Value {
+    let item = serde_json::json!({
+        "type": "web_search_call",
+        "id": id,
+        "status": status,
+        "action": {"type": "search", "query": query}
+    });
+    serde_json::json!({
+        "type": "response.output_item.added",
+        "item": item
+    })
+}
+
+/// SSE event for a finished `web_search_call` item
+/// (`response.output_item.done`) with a `search` action for `query`.
+pub fn ev_web_search_call_done(id: &str, status: &str, query: &str) -> Value {
+    let item = serde_json::json!({
+        "type": "web_search_call",
+        "id": id,
+        "status": status,
+        "action": {"type": "search", "query": query}
+    });
+    serde_json::json!({
+        "type": "response.output_item.done",
+        "item": item
+    })
+}
+
+/// SSE event for a finished `function_call` item. `arguments` is passed
+/// through verbatim as a string.
+pub fn ev_function_call(call_id: &str, name: &str, arguments: &str) -> Value {
+    let item = serde_json::json!({
+        "type": "function_call",
+        "call_id": call_id,
+        "name": name,
+        "arguments": arguments
+    });
+    serde_json::json!({
+        "type": "response.output_item.done",
+        "item": item
+    })
+}
+
+/// SSE event for a finished `custom_tool_call` item. `input` is passed
+/// through verbatim as a string.
+pub fn ev_custom_tool_call(call_id: &str, name: &str, input: &str) -> Value {
+    let item = serde_json::json!({
+        "type": "custom_tool_call",
+        "call_id": call_id,
+        "name": name,
+        "input": input
+    });
+    serde_json::json!({
+        "type": "response.output_item.done",
+        "item": item
+    })
+}
+
+/// SSE event for a finished `local_shell_call` item whose action is an `exec`
+/// of the given `command` argument vector.
+pub fn ev_local_shell_call(call_id: &str, status: &str, command: Vec<&str>) -> Value {
+    let action = serde_json::json!({
+        "type": "exec",
+        "command": command,
+    });
+    serde_json::json!({
+        "type": "response.output_item.done",
+        "item": {
+            "type": "local_shell_call",
+            "call_id": call_id,
+            "status": status,
+            "action": action
+        }
+    })
+}
+
+/// Convenience: SSE event for an `apply_patch` custom tool call with raw patch
+/// text. This mirrors the payload produced by the Responses API when the model
+/// invokes `apply_patch` directly (before we convert it to a function call).
+pub fn ev_apply_patch_custom_tool_call(call_id: &str, patch: &str) -> Value {
+    let item = serde_json::json!({
+        "type": "custom_tool_call",
+        "name": "apply_patch",
+        "input": patch,
+        "call_id": call_id
+    });
+    serde_json::json!({
+        "type": "response.output_item.done",
+        "item": item
+    })
+}
+
+/// Convenience: SSE event for an `apply_patch` function call. The Responses API
+/// wraps the patch content in a JSON string under the `input` key; we recreate
+/// the same structure so downstream code exercises the full parsing path.
+pub fn ev_apply_patch_function_call(call_id: &str, patch: &str) -> Value {
+    // `arguments` is itself a JSON document encoded as a string.
+    let encoded_arguments = serde_json::to_string(&serde_json::json!({ "input": patch }))
+        .expect("serialize apply_patch arguments");
+
+    let item = serde_json::json!({
+        "type": "function_call",
+        "name": "apply_patch",
+        "arguments": encoded_arguments,
+        "call_id": call_id
+    });
+    serde_json::json!({
+        "type": "response.output_item.done",
+        "item": item
+    })
+}
+
+/// Builds a complete SSE body containing a single `response.failed` event
+/// with the given response id, error code and error message.
+pub fn sse_failed(id: &str, code: &str, message: &str) -> String {
+    let error = serde_json::json!({"code": code, "message": message});
+    let failed = serde_json::json!({
+        "type": "response.failed",
+        "response": {
+            "id": id,
+            "error": error
+        }
+    });
+    sse(vec![failed])
+}
+
+/// Wraps an SSE body in a `200` response template served as
+/// `text/event-stream`.
+pub fn sse_response(body: String) -> ResponseTemplate {
+    let template = ResponseTemplate::new(200);
+    let template = template.insert_header("content-type", "text/event-stream");
+    template.set_body_raw(body, "text/event-stream")
+}
+
+/// Creates the shared starting point for every mount helper: a builder that
+/// matches `POST` requests whose path ends in `/responses`, plus the
+/// [`ResponseMock`] that records each request it is asked to match.
+fn base_mock() -> (MockBuilder, ResponseMock) {
+    let recorder = ResponseMock::new();
+    let builder = Mock::given(method("POST"))
+        .and(path_regex(".*/responses$"))
+        .and(recorder.clone());
+    (builder, recorder)
+}
+
+/// Mounts a mock that answers at most one request satisfying `matcher` (in
+/// addition to the base `POST .../responses` match) with the SSE `body`.
+/// Returns the recorder for the requests it sees.
+pub async fn mount_sse_once_match<M>(server: &MockServer, matcher: M, body: String) -> ResponseMock
+where
+    M: wiremock::Match + Send + Sync + 'static,
+{
+    let (builder, recorder) = base_mock();
+    let mock = builder
+        .and(matcher)
+        .respond_with(sse_response(body))
+        .up_to_n_times(1);
+    mock.mount(server).await;
+    recorder
+}
+
+/// Mounts a mock that answers at most one `POST .../responses` request with
+/// the SSE `body`. Returns the recorder for the requests it sees.
+pub async fn mount_sse_once(server: &MockServer, body: String) -> ResponseMock {
+    let (builder, recorder) = base_mock();
+    let mock = builder.respond_with(sse_response(body)).up_to_n_times(1);
+    mock.mount(server).await;
+    recorder
+}
+
+/// Mounts a mock that answers every `POST .../responses` request with the SSE
+/// `body`, with no limit on the number of calls. Returns the recorder for the
+/// requests it sees.
+pub async fn mount_sse(server: &MockServer, body: String) -> ResponseMock {
+    let (builder, recorder) = base_mock();
+    let mock = builder.respond_with(sse_response(body));
+    mock.mount(server).await;
+    recorder
+}
+
+/// Starts a wiremock server whose diagnostics print up to 80,000 bytes of
+/// each request body.
+pub async fn start_mock_server() -> MockServer {
+    let builder = MockServer::builder().body_print_limit(BodyPrintLimit::Limited(80_000));
+    builder.start().await
+}
+
+/// Mounts a sequence of SSE response bodies and serves them in order for each
+/// POST to `/v1/responses`. Panics if more requests are received than bodies
+/// provided. Also asserts the exact number of expected calls.
+///
+/// Each body is served through [`sse_response`], so the response carries
+/// `Content-Type: text/event-stream` exactly like the other mount helpers.
+/// (`set_body_string` would replace the content type with `text/plain`.)
+pub async fn mount_sse_sequence(server: &MockServer, bodies: Vec<String>) -> ResponseMock {
+    use std::sync::atomic::AtomicUsize;
+    use std::sync::atomic::Ordering;
+
+    /// Hands out `responses[i]` for the i-th call it receives.
+    struct SeqResponder {
+        num_calls: AtomicUsize,
+        responses: Vec<String>,
+    }
+
+    impl Respond for SeqResponder {
+        fn respond(&self, _: &wiremock::Request) -> ResponseTemplate {
+            let call_num = self.num_calls.fetch_add(1, Ordering::SeqCst);
+            match self.responses.get(call_num) {
+                Some(body) => sse_response(body.clone()),
+                None => panic!("no response for {call_num}"),
+            }
+        }
+    }
+
+    let num_calls = bodies.len();
+    let responder = SeqResponder {
+        num_calls: AtomicUsize::new(0),
+        responses: bodies,
+    };
+
+    let (mock, response_mock) = base_mock();
+    // Stop matching after the last body and require that every body is used.
+    mock.respond_with(responder)
+        .up_to_n_times(num_calls as u64)
+        .expect(num_calls as u64)
+        .mount(server)
+        .await;
+
+    response_mock
+}
+
+/// Validate invariants on the request body sent to `/v1/responses`.
+///
+/// Bodies that are not valid JSON are ignored. For JSON bodies:
+///
+/// - The body must contain an `input` array.
+/// - No `function_call_output`/`custom_tool_call_output` with missing/empty `call_id`.
+/// - Every `function_call_output` must match a `function_call` or
+///   `local_shell_call` with the same `call_id` in the same `input`.
+/// - Every `custom_tool_call_output` must match a `custom_tool_call`.
+/// - Additionally, enforce symmetry: every `function_call`/`custom_tool_call`
+///   in the `input` must have a matching output entry.
+///
+/// Matching is by set membership of `call_id`s; the relative position of a
+/// call and its output within `input` is not checked.
+///
+/// # Panics
+///
+/// Panics when any of the invariants above is violated.
+fn validate_request_body_invariants(request: &wiremock::Request) {
+    use std::collections::HashSet;
+
+    let body: Value = match request.body_json() {
+        Ok(body) => body,
+        Err(_) => return,
+    };
+    let items = match body.get("input").and_then(Value::as_array) {
+        Some(items) => items,
+        None => panic!("input array not found in request"),
+    };
+
+    /// True when `item["type"]` is the string `kind`.
+    fn has_kind(item: &Value, kind: &str) -> bool {
+        item.get("type").and_then(Value::as_str) == Some(kind)
+    }
+
+    /// The item's `call_id`, provided it is a non-empty string.
+    fn non_empty_call_id(item: &Value) -> Option<&str> {
+        match item.get("call_id").and_then(Value::as_str) {
+            Some(id) if !id.is_empty() => Some(id),
+            _ => None,
+        }
+    }
+
+    /// Call ids of all items of `kind`; items without a usable id are skipped.
+    fn call_ids(items: &[Value], kind: &str) -> HashSet<String> {
+        let mut ids = HashSet::new();
+        for item in items.iter().filter(|item| has_kind(item, kind)) {
+            if let Some(id) = non_empty_call_id(item) {
+                ids.insert(id.to_string());
+            }
+        }
+        ids
+    }
+
+    /// Call ids of all output items of `kind`; a missing/empty id is fatal.
+    fn output_call_ids(items: &[Value], kind: &str, missing_msg: &str) -> HashSet<String> {
+        let mut ids = HashSet::new();
+        for item in items.iter().filter(|item| has_kind(item, kind)) {
+            match non_empty_call_id(item) {
+                Some(id) => {
+                    ids.insert(id.to_string());
+                }
+                None => panic!("{missing_msg}"),
+            }
+        }
+        ids
+    }
+
+    let function_calls = call_ids(items, "function_call");
+    let custom_tool_calls = call_ids(items, "custom_tool_call");
+    let local_shell_calls = call_ids(items, "local_shell_call");
+    let function_call_outputs = output_call_ids(
+        items,
+        "function_call_output",
+        "orphan function_call_output with empty call_id should be dropped",
+    );
+    let custom_tool_call_outputs = output_call_ids(
+        items,
+        "custom_tool_call_output",
+        "orphan custom_tool_call_output with empty call_id should be dropped",
+    );
+
+    // Outputs must refer to a call present in the same request.
+    for output_id in &function_call_outputs {
+        let has_call =
+            function_calls.contains(output_id) || local_shell_calls.contains(output_id);
+        assert!(
+            has_call,
+            "function_call_output without matching call in input: {output_id}",
+        );
+    }
+    for output_id in &custom_tool_call_outputs {
+        assert!(
+            custom_tool_calls.contains(output_id),
+            "custom_tool_call_output without matching call in input: {output_id}",
+        );
+    }
+
+    // Symmetry: calls must have an output in the same request.
+    for call_id in &function_calls {
+        assert!(
+            function_call_outputs.contains(call_id),
+            "Function call output is missing for call id: {call_id}",
+        );
+    }
+    for call_id in &custom_tool_calls {
+        assert!(
+            custom_tool_call_outputs.contains(call_id),
+            "Custom tool call output is missing for call id: {call_id}",
+        );
+    }
+}
--- a/llmx-rs/core/tests/common/test_codex.rs
+++ b/llmx-rs/core/tests/common/test_codex.rs
@@ -0,0 +1,288 @@
+use std::mem::swap;
+use std::path::Path;
+use std::path::PathBuf;
+use std::sync::Arc;
+
+use anyhow::Result;
+use codex_core::CodexAuth;
+use codex_core::CodexConversation;
+use codex_core::ConversationManager;
+use codex_core::ModelProviderInfo;
+use codex_core::built_in_model_providers;
+use codex_core::config::Config;
+use codex_core::features::Feature;
+use codex_core::protocol::AskForApproval;
+use codex_core::protocol::EventMsg;
+use codex_core::protocol::Op;
+use codex_core::protocol::SandboxPolicy;
+use codex_core::protocol::SessionConfiguredEvent;
+use codex_protocol::config_types::ReasoningSummary;
+use codex_protocol::user_input::UserInput;
+use serde_json::Value;
+use tempfile::TempDir;
+use wiremock::MockServer;
+
+use crate::load_default_config_for_test;
+use crate::responses::start_mock_server;
+use crate::wait_for_event;
+
+/// A one-shot, `Send` closure that edits a `Config` in place.
+type ConfigMutator = dyn FnOnce(&mut Config) + Send;
+
+/// Builder for [`TestCodex`] that collects `Config` mutators to apply before the conversation
+/// is created.
+pub struct TestCodexBuilder {
+    /// Mutators queued by `with_config`; they are taken out of the builder and run, in insertion
+    /// order, while the config is being prepared.
+    config_mutators: Vec<Box<ConfigMutator>>,
+}
+
+impl TestCodexBuilder {
+    /// Queues `mutator` to run against the `Config` during the next build and hands the builder
+    /// back for chaining.
+    pub fn with_config<T>(mut self, mutator: T) -> Self
+    where
+        T: FnOnce(&mut Config) + Send + 'static,
+    {
+        let boxed: Box<ConfigMutator> = Box::new(mutator);
+        self.config_mutators.push(boxed);
+        self
+    }
+
+    /// Builds a new conversation against `server`, using a freshly created temporary home
+    /// directory.
+    pub async fn build(&mut self, server: &wiremock::MockServer) -> anyhow::Result<TestCodex> {
+        let fresh_home = TempDir::new()?;
+        self.build_with_home(server, Arc::new(fresh_home), None).await
+    }
+
+    /// Resumes a conversation from the rollout file at `rollout_path`, reusing the caller's
+    /// `home` directory.
+    pub async fn resume(
+        &mut self,
+        server: &wiremock::MockServer,
+        home: Arc<TempDir>,
+        rollout_path: PathBuf,
+    ) -> anyhow::Result<TestCodex> {
+        let resume_from = Some(rollout_path);
+        self.build_with_home(server, home, resume_from).await
+    }
+
+    /// Shared implementation behind `build` and `resume`: prepares the config, then either
+    /// resumes from `resume_from` or starts a brand-new conversation.
+    async fn build_with_home(
+        &mut self,
+        server: &wiremock::MockServer,
+        home: Arc<TempDir>,
+        resume_from: Option<PathBuf>,
+    ) -> anyhow::Result<TestCodex> {
+        let (config, cwd) = self.prepare_config(server, &home).await?;
+        let manager = ConversationManager::with_auth(CodexAuth::from_api_key("dummy"));
+
+        let started = if let Some(rollout) = resume_from {
+            let auth_manager =
+                codex_core::AuthManager::from_auth_for_testing(CodexAuth::from_api_key("dummy"));
+            manager
+                .resume_conversation_from_rollout(config, rollout, auth_manager)
+                .await?
+        } else {
+            manager.new_conversation(config).await?
+        };
+
+        Ok(TestCodex {
+            home,
+            cwd,
+            codex: started.conversation,
+            session_configured: started.session_configured,
+        })
+    }
+
+    /// Produces the `Config` for a test run together with the temporary working directory it
+    /// points at.
+    ///
+    /// The queued mutators are moved out of the builder here, so a later build on the same
+    /// builder starts with an empty mutator list.
+    async fn prepare_config(
+        &mut self,
+        server: &wiremock::MockServer,
+        home: &TempDir,
+    ) -> anyhow::Result<(Config, Arc<TempDir>)> {
+        // Start from the built-in "openai" provider and redirect it to the mock server.
+        let mut provider = built_in_model_providers()["openai"].clone();
+        provider.base_url = Some(format!("{}/v1", server.uri()));
+
+        let cwd = Arc::new(TempDir::new()?);
+        let mut config = load_default_config_for_test(home);
+        config.cwd = cwd.path().to_path_buf();
+        config.model_provider = provider;
+        if let Ok(sandbox_cmd) = assert_cmd::Command::cargo_bin("codex") {
+            let program = sandbox_cmd.get_program().to_os_string();
+            config.codex_linux_sandbox_exe = Some(PathBuf::from(program));
+        }
+
+        // Take ownership of the queued mutators (leaving the builder's list empty) and apply
+        // them in the order they were registered.
+        let mut pending: Vec<Box<ConfigMutator>> = Vec::new();
+        swap(&mut self.config_mutators, &mut pending);
+        pending.into_iter().for_each(|apply| apply(&mut config));
+
+        // Keep the freeform apply-patch feature flag in step with the config switch, after the
+        // mutators have had their say.
+        let wants_apply_patch = config.include_apply_patch_tool;
+        if wants_apply_patch {
+            config.features.enable(Feature::ApplyPatchFreeform);
+        } else {
+            config.features.disable(Feature::ApplyPatchFreeform);
+        }
+
+        Ok((config, cwd))
+    }
+}
+
+/// A conversation created by [`TestCodexBuilder`] together with the temporary directories it
+/// was configured with.
+pub struct TestCodex {
+    /// Temporary directory the default test config was loaded for.
+    pub home: Arc<TempDir>,
+    /// Temporary directory whose path was installed as the config's `cwd`.
+    pub cwd: Arc<TempDir>,
+    /// The conversation returned by the conversation manager.
+    pub codex: Arc<CodexConversation>,
+    /// The session-configured event returned alongside the conversation.
+    pub session_configured: SessionConfiguredEvent,
+}
+
+impl TestCodex {
+    /// Path of the temporary working directory.
+    pub fn cwd_path(&self) -> &Path {
+        let workspace: &TempDir = &self.cwd;
+        workspace.path()
+    }
+
+    /// Joins `rel` onto the temporary working directory.
+    pub fn workspace_path(&self, rel: impl AsRef<Path>) -> PathBuf {
+        let root = self.cwd_path();
+        root.join(rel)
+    }
+
+    /// Submits `prompt` as a user turn with `SandboxPolicy::DangerFullAccess` and waits for the
+    /// task to complete.
+    pub async fn submit_turn(&self, prompt: &str) -> Result<()> {
+        let policy = SandboxPolicy::DangerFullAccess;
+        self.submit_turn_with_policy(prompt, policy).await
+    }
+
+    /// Submits `prompt` as a user turn under `sandbox_policy`, then waits until a
+    /// `TaskComplete` event is observed.
+    ///
+    /// The turn uses the session's configured model, never asks for approval, and runs in the
+    /// temporary working directory.
+    pub async fn submit_turn_with_policy(
+        &self,
+        prompt: &str,
+        sandbox_policy: SandboxPolicy,
+    ) -> Result<()> {
+        let turn = Op::UserTurn {
+            items: vec![UserInput::Text {
+                text: prompt.into(),
+            }],
+            final_output_json_schema: None,
+            cwd: self.cwd.path().to_path_buf(),
+            approval_policy: AskForApproval::Never,
+            sandbox_policy,
+            model: self.session_configured.model.clone(),
+            effort: None,
+            summary: ReasoningSummary::Auto,
+        };
+        self.codex.submit(turn).await?;
+
+        wait_for_event(&self.codex, |event| {
+            matches!(event, EventMsg::TaskComplete(_))
+        })
+        .await;
+        Ok(())
+    }
+}
+
+/// Bundles a mock server with the [`TestCodex`] that was built against it.
+pub struct TestCodexHarness {
+    /// Mock server the conversation's model provider points at.
+    server: MockServer,
+    /// Conversation and temporary directories built against `server`.
+    test: TestCodex,
+}
+
+impl TestCodexHarness {
+    /// Starts a mock server and builds a conversation against it with no config mutators.
+    pub async fn new() -> Result<Self> {
+        let builder = test_codex();
+        Self::with_builder(builder).await
+    }
+
+    /// Like [`TestCodexHarness::new`], but applies `mutator` to the config first.
+    pub async fn with_config(mutator: impl FnOnce(&mut Config) + Send + 'static) -> Result<Self> {
+        let builder = test_codex().with_config(mutator);
+        Self::with_builder(builder).await
+    }
+
+    /// Starts a mock server and uses `builder` to build the conversation against it.
+    pub async fn with_builder(mut builder: TestCodexBuilder) -> Result<Self> {
+        let server = start_mock_server().await;
+        let test = builder.build(&server).await?;
+        Ok(Self { server, test })
+    }
+
+    /// The mock server backing this harness.
+    pub fn server(&self) -> &MockServer {
+        let Self { server, .. } = self;
+        server
+    }
+
+    /// The conversation under test.
+    pub fn test(&self) -> &TestCodex {
+        let Self { test, .. } = self;
+        test
+    }
+
+    /// Path of the conversation's temporary working directory.
+    pub fn cwd(&self) -> &Path {
+        self.test().cwd_path()
+    }
+
+    /// Joins `rel` onto the conversation's temporary working directory.
+    pub fn path(&self, rel: impl AsRef<Path>) -> PathBuf {
+        self.test().workspace_path(rel)
+    }
+
+    /// Submits `prompt` as a user turn and waits for it to complete.
+    pub async fn submit(&self, prompt: &str) -> Result<()> {
+        let test = &self.test;
+        test.submit_turn(prompt).await
+    }
+
+    /// Submits `prompt` as a user turn under `sandbox_policy` and waits for it to complete.
+    pub async fn submit_with_policy(
+        &self,
+        prompt: &str,
+        sandbox_policy: SandboxPolicy,
+    ) -> Result<()> {
+        let test = &self.test;
+        test.submit_turn_with_policy(prompt, sandbox_policy).await
+    }
+
+    /// Returns the body of every request the mock server has received, parsed as JSON.
+    ///
+    /// Panics if the server yields no request list or if a body is not valid JSON.
+    pub async fn request_bodies(&self) -> Vec<Value> {
+        let recorded = self.server.received_requests().await.expect("requests");
+        let mut bodies = Vec::with_capacity(recorded.len());
+        for req in recorded {
+            bodies.push(serde_json::from_slice(&req.body).expect("request body json"));
+        }
+        bodies
+    }
+
+    /// Returns a copy of the `function_call_output` item with the given `call_id`, searching
+    /// the `input` arrays of all received requests.
+    pub async fn function_call_output_value(&self, call_id: &str) -> Value {
+        let bodies = self.request_bodies().await;
+        let item = function_call_output(&bodies, call_id);
+        item.clone()
+    }
+
+    /// Returns the string stored under `"output"` in the matching `function_call_output` item.
+    ///
+    /// Panics if that field is missing or is not a string.
+    pub async fn function_call_stdout(&self, call_id: &str) -> String {
+        let item = self.function_call_output_value(call_id).await;
+        let stdout = item
+            .get("output")
+            .and_then(Value::as_str)
+            .expect("output string");
+        stdout.to_string()
+    }
+
+    /// Returns the string stored under `"output"` in the matching `custom_tool_call_output`
+    /// item.
+    ///
+    /// Panics if that field is missing or is not a string.
+    pub async fn custom_tool_call_output(&self, call_id: &str) -> String {
+        let bodies = self.request_bodies().await;
+        let item = custom_tool_call_output(&bodies, call_id);
+        let output = item
+            .get("output")
+            .and_then(Value::as_str)
+            .expect("output string");
+        output.to_string()
+    }
+}
+
+/// Returns the first item, across the `input` arrays of `bodies`, whose `type` equals `kind`
+/// and whose `call_id` equals `call_id`.
+///
+/// Bodies are searched in order; a body without an `input` array is skipped.
+///
+/// # Panics
+///
+/// Panics with `"{kind} {call_id} not found"` when no item matches.
+fn find_output_item<'a>(bodies: &'a [Value], kind: &str, call_id: &str) -> &'a Value {
+    bodies
+        .iter()
+        .filter_map(|body| body.get("input").and_then(Value::as_array))
+        .flatten()
+        .find(|item| {
+            item.get("type").and_then(Value::as_str) == Some(kind)
+                && item.get("call_id").and_then(Value::as_str) == Some(call_id)
+        })
+        .unwrap_or_else(|| panic!("{kind} {call_id} not found"))
+}
+
+/// Finds the `custom_tool_call_output` item with the given `call_id` in the request bodies.
+///
+/// # Panics
+///
+/// Panics with `"custom_tool_call_output {call_id} not found"` when no item matches.
+fn custom_tool_call_output<'a>(bodies: &'a [Value], call_id: &str) -> &'a Value {
+    find_output_item(bodies, "custom_tool_call_output", call_id)
+}
+
+/// Finds the `function_call_output` item with the given `call_id` in the request bodies.
+///
+/// # Panics
+///
+/// Panics with `"function_call_output {call_id} not found"` when no item matches.
+fn function_call_output<'a>(bodies: &'a [Value], call_id: &str) -> &'a Value {
+    find_output_item(bodies, "function_call_output", call_id)
+}
+
+/// Creates a [`TestCodexBuilder`] with no config mutators queued.
+pub fn test_codex() -> TestCodexBuilder {
+    let config_mutators = Vec::new();
+    TestCodexBuilder { config_mutators }
+}
--- a/llmx-rs/core/tests/common/test_codex_exec.rs
+++ b/llmx-rs/core/tests/common/test_codex_exec.rs
@@ -0,0 +1,41 @@
+#![allow(clippy::expect_used)]
+use codex_core::auth::CODEX_API_KEY_ENV_VAR;
+use std::path::Path;
+use tempfile::TempDir;
+use wiremock::MockServer;
+
+/// Holds the temporary directories used when spawning the `codex-exec` binary in tests.
+pub struct TestCodexExecBuilder {
+    /// Temporary directory exported to the child process as `CODEX_HOME`.
+    home: TempDir,
+    /// Temporary directory used as the child process's working directory.
+    cwd: TempDir,
+}
+
+impl TestCodexExecBuilder {
+    /// Builds a command for the `codex-exec` binary that runs in the temporary working
+    /// directory with `CODEX_HOME` pointing at the temporary home and a dummy API key set.
+    ///
+    /// Panics if the `codex-exec` binary cannot be located.
+    pub fn cmd(&self) -> assert_cmd::Command {
+        let located = assert_cmd::Command::cargo_bin("codex-exec");
+        let mut command = located.expect("should find binary for codex-exec");
+        command.current_dir(self.cwd.path());
+        command.env("CODEX_HOME", self.home.path());
+        command.env(CODEX_API_KEY_ENV_VAR, "dummy");
+        command
+    }
+    /// Same as [`TestCodexExecBuilder::cmd`], with `OPENAI_BASE_URL` set to the `/v1` path of
+    /// `server`.
+    pub fn cmd_with_server(&self, server: &MockServer) -> assert_cmd::Command {
+        let mut command = self.cmd();
+        command.env("OPENAI_BASE_URL", format!("{}/v1", server.uri()));
+        command
+    }
+
+    /// Path of the temporary working directory.
+    pub fn cwd_path(&self) -> &Path {
+        let Self { cwd, .. } = self;
+        cwd.path()
+    }
+    /// Path of the temporary home directory.
+    pub fn home_path(&self) -> &Path {
+        let Self { home, .. } = self;
+        home.path()
+    }
+}
+
+/// Creates a [`TestCodexExecBuilder`] backed by two fresh temporary directories.
+///
+/// Panics if either directory cannot be created.
+pub fn test_codex_exec() -> TestCodexExecBuilder {
+    let home = TempDir::new().expect("create temp home");
+    let cwd = TempDir::new().expect("create temp cwd");
+    TestCodexExecBuilder { home, cwd }
+}