From b84a920067aa87aea2cb0a7002050aa64648b324 Mon Sep 17 00:00:00 2001 From: jif-oai Date: Tue, 23 Sep 2025 17:59:17 +0100 Subject: [PATCH] chore: compact do not modify instructions (#4088) Keep the developer instruction and insert the summarisation message as a user message instead --- codex-rs/core/src/codex.rs | 9 +- codex-rs/core/src/codex/compact.rs | 31 ++----- codex-rs/core/tests/suite/compact.rs | 86 +++++++++++++------ .../core/tests/suite/compact_resume_fork.rs | 24 +++--- 4 files changed, 81 insertions(+), 69 deletions(-) diff --git a/codex-rs/core/src/codex.rs b/codex-rs/core/src/codex.rs index b89ebc73..49ab6940 100644 --- a/codex-rs/core/src/codex.rs +++ b/codex-rs/core/src/codex.rs @@ -1117,16 +1117,13 @@ impl AgentTask { turn_context: Arc, sub_id: String, input: Vec, - compact_instructions: String, ) -> Self { let handle = { let sess = sess.clone(); let sub_id = sub_id.clone(); let tc = Arc::clone(&turn_context); - tokio::spawn(async move { - compact::run_compact_task(sess, tc, sub_id, input, compact_instructions).await - }) - .abort_handle() + tokio::spawn(async move { compact::run_compact_task(sess, tc, sub_id, input).await }) + .abort_handle() }; Self { sess, @@ -1434,7 +1431,7 @@ async fn submission_loop( // Attempt to inject input into current task if let Err(items) = sess .inject_input(vec![InputItem::Text { - text: compact::COMPACT_TRIGGER_TEXT.to_string(), + text: compact::SUMMARIZATION_PROMPT.to_string(), }]) .await { diff --git a/codex-rs/core/src/codex/compact.rs b/codex-rs/core/src/codex/compact.rs index d1547a48..a60466b9 100644 --- a/codex-rs/core/src/codex/compact.rs +++ b/codex-rs/core/src/codex/compact.rs @@ -27,8 +27,7 @@ use codex_protocol::models::ResponseItem; use codex_protocol::protocol::RolloutItem; use futures::prelude::*; -pub(super) const COMPACT_TRIGGER_TEXT: &str = "Start Summarization"; -const SUMMARIZATION_PROMPT: &str = include_str!("../../templates/compact/prompt.md"); +pub const SUMMARIZATION_PROMPT: &str = include_str!("../../templates/compact/prompt.md"); const COMPACT_USER_MESSAGE_MAX_TOKENS: usize = 20_000; #[derive(Template)] @@ -44,13 +43,7 @@ pub(super) async fn spawn_compact_task( sub_id: String, input: Vec, ) { - let task = AgentTask::compact( - sess.clone(), - turn_context, - sub_id, - input, - SUMMARIZATION_PROMPT.to_string(), - ); + let task = AgentTask::compact(sess.clone(), turn_context, sub_id, input); sess.set_task(task).await; } @@ -60,17 +53,9 @@ pub(super) async fn run_inline_auto_compact_task( ) { let sub_id = sess.next_internal_sub_id(); let input = vec![InputItem::Text { - text: COMPACT_TRIGGER_TEXT.to_string(), + text: SUMMARIZATION_PROMPT.to_string(), }]; - run_compact_task_inner( - sess, - turn_context, - sub_id, - input, - SUMMARIZATION_PROMPT.to_string(), - false, - ) - .await; + run_compact_task_inner(sess, turn_context, sub_id, input, None, false).await; } pub(super) async fn run_compact_task( @@ -78,7 +63,6 @@ pub(super) async fn run_compact_task( turn_context: Arc, sub_id: String, input: Vec, - compact_instructions: String, ) { let start_event = Event { id: sub_id.clone(), @@ -92,7 +76,7 @@ pub(super) async fn run_compact_task( turn_context, sub_id.clone(), input, - compact_instructions, + None, true, ) .await; @@ -110,11 +94,10 @@ async fn run_compact_task_inner( turn_context: Arc, sub_id: String, input: Vec, - compact_instructions: String, + instructions_override: Option, remove_task_on_completion: bool, ) { let initial_input_for_turn: ResponseInputItem = ResponseInputItem::from(input); - let instructions_override = compact_instructions; let turn_input = sess .turn_input_with_history(vec![initial_input_for_turn.clone().into()]) .await; @@ -122,7 +105,7 @@ async fn run_compact_task_inner( let prompt = Prompt { input: turn_input, tools: Vec::new(), - base_instructions_override: Some(instructions_override), + base_instructions_override: instructions_override, }; let max_retries = turn_context.client.get_provider().stream_max_retries(); diff --git a/codex-rs/core/tests/suite/compact.rs b/codex-rs/core/tests/suite/compact.rs index 3cae9841..9e456e58 100644 --- a/codex-rs/core/tests/suite/compact.rs +++ b/codex-rs/core/tests/suite/compact.rs @@ -19,6 +19,7 @@ use wiremock::ResponseTemplate; use wiremock::matchers::method; use wiremock::matchers::path; +use codex_core::codex::compact::SUMMARIZATION_PROMPT; use core_test_support::non_sandbox_test; use core_test_support::responses::ev_assistant_message; use core_test_support::responses::ev_completed; @@ -37,7 +38,6 @@ use std::sync::atomic::Ordering; pub(super) const FIRST_REPLY: &str = "FIRST_REPLY"; pub(super) const SUMMARY_TEXT: &str = "SUMMARY_ONLY_CONTEXT"; -pub(super) const SUMMARIZE_TRIGGER: &str = "Start Summarization"; const THIRD_USER_MSG: &str = "next turn"; const AUTO_SUMMARY_TEXT: &str = "AUTO_SUMMARY"; const FIRST_AUTO_MSG: &str = "token limit start"; @@ -77,13 +77,13 @@ async fn summarize_context_three_requests_and_instructions() { let first_matcher = |req: &wiremock::Request| { let body = std::str::from_utf8(&req.body).unwrap_or(""); body.contains("\"text\":\"hello world\"") - && !body.contains(&format!("\"text\":\"{SUMMARIZE_TRIGGER}\"")) + && !body.contains("You have exceeded the maximum number of tokens") }; mount_sse_once(&server, first_matcher, sse1).await; let second_matcher = |req: &wiremock::Request| { let body = std::str::from_utf8(&req.body).unwrap_or(""); - body.contains(&format!("\"text\":\"{SUMMARIZE_TRIGGER}\"")) + body.contains("You have exceeded the maximum number of tokens") }; mount_sse_once(&server, second_matcher, sse2).await; @@ -121,7 +121,7 @@ async fn summarize_context_three_requests_and_instructions() { .unwrap(); wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await; - // 2) Summarize – second hit with summarization instructions. + // 2) Summarize – second hit should include the summarization prompt. codex.submit(Op::Compact).await.unwrap(); wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await; @@ -148,16 +148,12 @@ async fn summarize_context_three_requests_and_instructions() { let body2 = req2.body_json::().unwrap(); let body3 = req3.body_json::().unwrap(); - // System instructions should change for the summarization turn. + // Manual compact should keep the baseline developer instructions. let instr1 = body1.get("instructions").and_then(|v| v.as_str()).unwrap(); let instr2 = body2.get("instructions").and_then(|v| v.as_str()).unwrap(); - assert_ne!( + assert_eq!( instr1, instr2, - "summarization should override base instructions" - ); - assert!( - instr2.contains("You have exceeded the maximum number of tokens"), - "summarization instructions not applied" + "manual compact should keep the standard developer instructions" ); // The summarization request should include the injected user input marker. @@ -167,14 +163,14 @@ async fn summarize_context_three_requests_and_instructions() { assert_eq!(last2.get("type").unwrap().as_str().unwrap(), "message"); assert_eq!(last2.get("role").unwrap().as_str().unwrap(), "user"); let text2 = last2["content"][0]["text"].as_str().unwrap(); - assert!( - text2.contains(SUMMARIZE_TRIGGER), + assert_eq!( + text2, SUMMARIZATION_PROMPT, "expected summarize trigger, got `{text2}`" ); // Third request must contain the refreshed instructions, bridge summary message and new user msg. let input3 = body3.get("input").and_then(|v| v.as_array()).unwrap(); - println!("third request body: {body3}"); + assert!( input3.len() >= 3, "expected refreshed context and new user message in third request" @@ -215,13 +211,13 @@ async fn summarize_context_three_requests_and_instructions() { "bridge should capture earlier user messages" ); assert!( - !bridge_text.contains(SUMMARIZE_TRIGGER), + !bridge_text.contains(SUMMARIZATION_PROMPT), "bridge text should not echo the summarize trigger" ); assert!( !messages .iter() - .any(|(_, text)| text.contains(SUMMARIZE_TRIGGER)), + .any(|(_, text)| text.contains(SUMMARIZATION_PROMPT)), "third request should not include the summarize trigger" ); @@ -395,6 +391,7 @@ async fn auto_compact_runs_after_token_limit_hit() { "auto compact should add a third request" ); + let body_first = requests[0].body_json::().unwrap(); let body3 = requests[auto_compact_index] .body_json::() .unwrap(); @@ -402,9 +399,32 @@ async fn auto_compact_runs_after_token_limit_hit() { .get("instructions") .and_then(|v| v.as_str()) .unwrap_or_default(); - assert!( - instructions.contains("You have exceeded the maximum number of tokens"), - "auto compact should reuse summarization instructions" + let baseline_instructions = body_first + .get("instructions") + .and_then(|v| v.as_str()) + .unwrap_or_default() + .to_string(); + assert_eq!( + instructions, baseline_instructions, + "auto compact should keep the standard developer instructions", + ); + + let input3 = body3.get("input").and_then(|v| v.as_array()).unwrap(); + let last3 = input3 + .last() + .expect("auto compact request should append a user message"); + assert_eq!(last3.get("type").and_then(|v| v.as_str()), Some("message")); + assert_eq!(last3.get("role").and_then(|v| v.as_str()), Some("user")); + let last_text = last3 + .get("content") + .and_then(|v| v.as_array()) + .and_then(|items| items.first()) + .and_then(|item| item.get("text")) + .and_then(|text| text.as_str()) + .unwrap_or_default(); + assert_eq!( + last_text, SUMMARIZATION_PROMPT, + "auto compact should send the summarization prompt as a user message", ); } @@ -635,13 +655,25 @@ async fn auto_compact_stops_after_failed_attempt() { ); let last_body = requests[2].body_json::().unwrap(); - let instructions = last_body - .get("instructions") - .and_then(|v| v.as_str()) - .unwrap_or_default(); + let input = last_body + .get("input") + .and_then(|v| v.as_array()) + .unwrap_or_else(|| panic!("unexpected request format: {last_body}")); + let contains_prompt = input.iter().any(|item| { + item.get("type").and_then(|v| v.as_str()) == Some("message") + && item.get("role").and_then(|v| v.as_str()) == Some("user") + && item + .get("content") + .and_then(|v| v.as_array()) + .and_then(|items| items.first()) + .and_then(|entry| entry.get("text")) + .and_then(|text| text.as_str()) + .map(|text| text == SUMMARIZATION_PROMPT) + .unwrap_or(false) + }); assert!( - !instructions.contains("You have exceeded the maximum number of tokens"), - "third request should be the follow-up turn, not another summarization" + !contains_prompt, + "third request should be the follow-up turn, not another summarization", ); } @@ -785,7 +817,7 @@ async fn auto_compact_allows_multiple_attempts_when_interleaved_with_other_turn_ ); assert!( request_bodies[1].contains("You have exceeded the maximum number of tokens"), - "first auto compact request should use summarization instructions" + "first auto compact request should include the summarization prompt" ); assert!( request_bodies[3].contains(&format!("unsupported call: {DUMMY_FUNCTION_NAME}")), @@ -793,6 +825,6 @@ async fn auto_compact_allows_multiple_attempts_when_interleaved_with_other_turn_ ); assert!( request_bodies[4].contains("You have exceeded the maximum number of tokens"), - "second auto compact request should reuse summarization instructions" + "second auto compact request should include the summarization prompt" ); } diff --git a/codex-rs/core/tests/suite/compact_resume_fork.rs b/codex-rs/core/tests/suite/compact_resume_fork.rs index 43d6192f..73ad091f 100644 --- a/codex-rs/core/tests/suite/compact_resume_fork.rs +++ b/codex-rs/core/tests/suite/compact_resume_fork.rs @@ -8,7 +8,6 @@ //! model-visible history matches the expected sequence of messages. use super::compact::FIRST_REPLY; -use super::compact::SUMMARIZE_TRIGGER; use super::compact::SUMMARY_TEXT; use codex_core::CodexAuth; use codex_core::CodexConversation; @@ -16,6 +15,7 @@ use codex_core::ConversationManager; use codex_core::ModelProviderInfo; use codex_core::NewConversation; use codex_core::built_in_model_providers; +use codex_core::codex::compact::SUMMARIZATION_PROMPT; use codex_core::config::Config; use codex_core::protocol::ConversationPathResponseEvent; use codex_core::protocol::EventMsg; @@ -183,11 +183,7 @@ async fn compact_resume_and_fork_preserve_model_history_view() { let compact_1 = json!( { "model": "gpt-5-codex", - "instructions": "You have exceeded the maximum number of tokens, please stop coding and instead write a short memento message for the next agent. Your note should: -- Summarize what you finished and what still needs work. If there was a recent update_plan call, repeat its steps verbatim. -- List outstanding TODOs with file paths / line numbers so they're easy to find. -- Flag code that needs more tests (edge cases, performance, integration, etc.). -- Record any open bugs, quirks, or setup steps that will make it easier for the next agent to pick up where you left off.", + "instructions": prompt, "input": [ { "type": "message", @@ -235,7 +231,7 @@ async fn compact_resume_and_fork_preserve_model_history_view() { "content": [ { "type": "input_text", - "text": "Start Summarization" + "text": SUMMARIZATION_PROMPT } ] } @@ -488,13 +484,14 @@ SUMMARY_ONLY_CONTEXT" ], "prompt_cache_key": fork_prompt_cache_key }); - let expected = json!([ + let mut expected = json!([ user_turn_1, compact_1, user_turn_2_after_compact, usert_turn_3_after_resume, user_turn_3_after_fork ]); + normalize_line_endings(&mut expected); assert_eq!(requests.len(), 5); assert_eq!(json!(requests), expected); } @@ -580,7 +577,7 @@ async fn compact_resume_after_second_compaction_preserves_history() { .unwrap_or_default() .to_string(); - let expected = json!([ + let mut expected = json!([ { "instructions": prompt, "input": [ @@ -637,6 +634,7 @@ async fn compact_resume_after_second_compaction_preserves_history() { ], } ]); + normalize_line_endings(&mut expected); let last_request_after_2_compacts = json!([{ "instructions": requests[requests.len() -1]["instructions"], "input": requests[requests.len() -1]["input"], @@ -698,7 +696,8 @@ async fn mount_initial_flow(server: &MockServer) { let match_first = |req: &wiremock::Request| { let body = std::str::from_utf8(&req.body).unwrap_or(""); body.contains("\"text\":\"hello world\"") - && !body.contains(&format!("\"text\":\"{SUMMARIZE_TRIGGER}\"")) + && !body.contains("You have exceeded the maximum number of tokens") + && !body.contains(&format!("\"text\":\"{SUMMARY_TEXT}\"")) && !body.contains("\"text\":\"AFTER_COMPACT\"") && !body.contains("\"text\":\"AFTER_RESUME\"") && !body.contains("\"text\":\"AFTER_FORK\"") @@ -707,7 +706,7 @@ async fn mount_initial_flow(server: &MockServer) { let match_compact = |req: &wiremock::Request| { let body = std::str::from_utf8(&req.body).unwrap_or(""); - body.contains(&format!("\"text\":\"{SUMMARIZE_TRIGGER}\"")) + body.contains("You have exceeded the maximum number of tokens") }; mount_sse_once(server, match_compact, sse2).await; @@ -741,7 +740,8 @@ async fn mount_second_compact_flow(server: &MockServer) { let match_second_compact = |req: &wiremock::Request| { let body = std::str::from_utf8(&req.body).unwrap_or(""); - body.contains(&format!("\"text\":\"{SUMMARIZE_TRIGGER}\"")) && body.contains("AFTER_FORK") + body.contains("You have exceeded the maximum number of tokens") + && body.contains("AFTER_FORK") }; mount_sse_once(server, match_second_compact, sse6).await;