diff --git a/llmx-rs/core/src/chat_completions.rs b/llmx-rs/core/src/chat_completions.rs
index 139b0d24..2ded19a0 100644
--- a/llmx-rs/core/src/chat_completions.rs
+++ b/llmx-rs/core/src/chat_completions.rs
@@ -56,7 +56,12 @@ pub(crate) async fn stream_chat_completions(
     let mut messages = Vec::<serde_json::Value>::new();
 
     let full_instructions = prompt.get_full_instructions(model_family);
-    messages.push(json!({"role": "system", "content": full_instructions}));
+    // Add cache_control to system instructions for Anthropic prompt caching
+    messages.push(json!({
+        "role": "system",
+        "content": full_instructions,
+        "cache_control": {"type": "ephemeral"}
+    }));
 
     let input = prompt.get_formatted_input();
 
@@ -413,6 +418,20 @@ pub(crate) async fn stream_chat_completions(
     }
 
     debug!("Built {} messages for API request", messages.len());
+
+    // Add cache_control to conversation history for Anthropic prompt caching.
+    // Add it to a message that's at least 3 messages before the end (stable history).
+    // This caches the earlier conversation while keeping recent turns uncached.
+    if messages.len() > 4 {
+        let cache_idx = messages.len().saturating_sub(4);
+        if let Some(msg) = messages.get_mut(cache_idx) {
+            if let Some(obj) = msg.as_object_mut() {
+                obj.insert("cache_control".to_string(), json!({"type": "ephemeral"}));
+                debug!("Added cache_control to message at index {} (conversation history)", cache_idx);
+            }
+        }
+    }
+
     debug!("=== End Chat Completions Request Debug ===");
 
     let tools_json = create_tools_json_for_chat_completions_api(&prompt.tools)?;
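
For reference, here is a minimal standalone sketch (not part of the patch) of the breakpoint arithmetic above. `cache_breakpoint` is a hypothetical helper, but the `len > 4` guard and `len - 4` index mirror the diff, so the last few messages always stay uncached:

// Hypothetical helper mirroring the patch's cache-point selection; the
// `len > 4` guard and `len - 4` index are taken directly from the diff above.
fn cache_breakpoint(len: usize) -> Option<usize> {
    // Only set a breakpoint once more than 4 messages exist, and pin it
    // 4 positions back so the most recent turns remain outside the cached
    // prefix and can vary between requests without invalidating it.
    (len > 4).then(|| len.saturating_sub(4))
}

fn main() {
    // 4 messages -> None; 5 -> Some(1); 12 -> Some(8).
    for len in [4, 5, 12] {
        println!("{len} messages -> cache_control at {:?}", cache_breakpoint(len));
    }
}

Because the breakpoint moves forward as the conversation grows, each request re-caches a slightly longer stable prefix while the trailing turns are always sent fresh.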