From 2b122da087e5515462733e54d540dec88b6ab134 Mon Sep 17 00:00:00 2001 From: Michael Bolin Date: Sat, 10 May 2025 17:52:59 -0700 Subject: [PATCH] feat: add support for AGENTS.md in Rust CLI (#885) The TypeScript CLI already has support for including the contents of `AGENTS.md` in the instructions sent with the first turn of a conversation. This PR brings this functionality to the Rust CLI. To be considered, `AGENTS.md` must be in the `cwd` of the session, or in one of the parent folders up to a Git/filesystem root (whichever is encountered first). By default, a maximum of 32 KiB of `AGENTS.md` will be included, though this is configurable using the new-in-this-PR `project_doc_max_bytes` option in `config.toml`. --- AGENTS.md | 5 + codex-rs/README.md | 4 + codex-rs/core/src/codex.rs | 5 +- codex-rs/core/src/config.rs | 12 ++ codex-rs/core/src/lib.rs | 1 + codex-rs/core/src/project_doc.rs | 275 +++++++++++++++++++++++++++++++ 6 files changed, 301 insertions(+), 1 deletion(-) create mode 100644 AGENTS.md create mode 100644 codex-rs/core/src/project_doc.rs diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 00000000..1348e578 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,5 @@ +# Rust/codex-rs + +In the codex-rs folder where the rust code lives: + +- Never add or modify any code related to `CODEX_SANDBOX_NETWORK_DISABLED_ENV_VAR`. You operate in a sandbox where `CODEX_SANDBOX_NETWORK_DISABLED=1` will be set whenever you use the `shell` tool. Any existing code that uses `CODEX_SANDBOX_NETWORK_DISABLED_ENV_VAR` was authored with this fact in mind. It is often used to early exit out of tests that the author knew you would not be able to run given your sandbox limitations. diff --git a/codex-rs/README.md b/codex-rs/README.md index d49a5949..827a5659 100644 --- a/codex-rs/README.md +++ b/codex-rs/README.md @@ -250,3 +250,7 @@ To have Codex use this script for notifications, you would configure it via `not ```toml notify = ["python3", "/Users/mbolin/.codex/notify.py"] ``` + +### project_doc_max_bytes + +Maximum number of bytes to read from an `AGENTS.md` file to include in the instructions sent with the first turn of a session. Defaults to 32 KiB. diff --git a/codex-rs/core/src/codex.rs b/codex-rs/core/src/codex.rs index 5cd5a679..6366d30c 100644 --- a/codex-rs/core/src/codex.rs +++ b/codex-rs/core/src/codex.rs @@ -52,6 +52,7 @@ use crate::models::FunctionCallOutputPayload; use crate::models::ResponseInputItem; use crate::models::ResponseItem; use crate::models::ShellToolCallParams; +use crate::project_doc::create_full_instructions; use crate::protocol::AskForApproval; use crate::protocol::Event; use crate::protocol::EventMsg; @@ -83,10 +84,12 @@ impl Codex { pub async fn spawn(config: Config, ctrl_c: Arc) -> CodexResult<(Codex, String)> { let (tx_sub, rx_sub) = async_channel::bounded(64); let (tx_event, rx_event) = async_channel::bounded(64); + + let instructions = create_full_instructions(&config).await; let configure_session = Op::ConfigureSession { provider: config.model_provider.clone(), model: config.model.clone(), - instructions: config.instructions.clone(), + instructions, approval_policy: config.approval_policy, sandbox_policy: config.sandbox_policy.clone(), disable_response_storage: config.disable_response_storage, diff --git a/codex-rs/core/src/config.rs b/codex-rs/core/src/config.rs index 2264792b..2e5b3f19 100644 --- a/codex-rs/core/src/config.rs +++ b/codex-rs/core/src/config.rs @@ -15,6 +15,11 @@ use std::path::PathBuf; /// correctly even if the user has not created `~/.codex/instructions.md`. const EMBEDDED_INSTRUCTIONS: &str = include_str!("../prompt.md"); +/// Maximum number of bytes of the documentation that will be embedded. Larger +/// files are *silently truncated* to this size so we do not take up too much of +/// the context window. +pub(crate) const PROJECT_DOC_MAX_BYTES: usize = 32 * 1024; // 32 KiB + /// Application configuration loaded from disk and merged with overrides. #[derive(Debug, Clone)] pub struct Config { @@ -72,6 +77,9 @@ pub struct Config { /// Combined provider map (defaults merged with user-defined overrides). pub model_providers: HashMap, + + /// Maximum number of bytes to include from an AGENTS.md project doc file. + pub project_doc_max_bytes: usize, } /// Base config deserialized from ~/.codex/config.toml. @@ -111,6 +119,9 @@ pub struct ConfigToml { /// User-defined provider entries that extend/override the built-in list. #[serde(default)] pub model_providers: HashMap, + + /// Maximum number of bytes to include from an AGENTS.md project doc file. + pub project_doc_max_bytes: Option, } impl ConfigToml { @@ -267,6 +278,7 @@ impl Config { instructions, mcp_servers: cfg.mcp_servers, model_providers, + project_doc_max_bytes: cfg.project_doc_max_bytes.unwrap_or(PROJECT_DOC_MAX_BYTES), }; Ok(config) } diff --git a/codex-rs/core/src/lib.rs b/codex-rs/core/src/lib.rs index 3e7fd7f7..43c97a87 100644 --- a/codex-rs/core/src/lib.rs +++ b/codex-rs/core/src/lib.rs @@ -28,6 +28,7 @@ mod model_provider_info; pub use model_provider_info::ModelProviderInfo; pub use model_provider_info::WireApi; mod models; +mod project_doc; pub mod protocol; mod rollout; mod safety; diff --git a/codex-rs/core/src/project_doc.rs b/codex-rs/core/src/project_doc.rs new file mode 100644 index 00000000..d468d61d --- /dev/null +++ b/codex-rs/core/src/project_doc.rs @@ -0,0 +1,275 @@ +//! Project-level documentation discovery. +//! +//! Project-level documentation can be stored in a file named `AGENTS.md`. +//! Currently, we include only the contents of the first file found as follows: +//! +//! 1. Look for the doc file in the current working directory (as determined +//! by the `Config`). +//! 2. If not found, walk *upwards* until the Git repository root is reached +//! (detected by the presence of a `.git` directory/file), or failing that, +//! the filesystem root. +//! 3. If the Git root is encountered, look for the doc file there. If it +//! exists, the search stops – we do **not** walk past the Git root. + +use crate::config::Config; +use std::path::Path; +use tokio::io::AsyncReadExt; +use tracing::error; + +/// Currently, we only match the filename `AGENTS.md` exactly. +const CANDIDATE_FILENAMES: &[&str] = &["AGENTS.md"]; + +/// When both `Config::instructions` and the project doc are present, they will +/// be concatenated with the following separator. +const PROJECT_DOC_SEPARATOR: &str = "\n\n--- project-doc ---\n\n"; + +/// Combines `Config::instructions` and `AGENTS.md` (if present) into a single +/// string of instructions. +pub(crate) async fn create_full_instructions(config: &Config) -> Option { + match find_project_doc(config).await { + Ok(Some(project_doc)) => match &config.instructions { + Some(original_instructions) => Some(format!( + "{original_instructions}{PROJECT_DOC_SEPARATOR}{project_doc}" + )), + None => Some(project_doc), + }, + Ok(None) => config.instructions.clone(), + Err(e) => { + error!("error trying to find project doc: {e:#}"); + config.instructions.clone() + } + } +} + +/// Attempt to locate and load the project documentation. Currently, the search +/// starts from `Config::cwd`, but if we may want to consider other directories +/// in the future, e.g., additional writable directories in the `SandboxPolicy`. +/// +/// On success returns `Ok(Some(contents))`. If no documentation file is found +/// the function returns `Ok(None)`. Unexpected I/O failures bubble up as +/// `Err` so callers can decide how to handle them. +async fn find_project_doc(config: &Config) -> std::io::Result> { + let max_bytes = config.project_doc_max_bytes; + + // Attempt to load from the working directory first. + if let Some(doc) = load_first_candidate(&config.cwd, CANDIDATE_FILENAMES, max_bytes).await? { + return Ok(Some(doc)); + } + + // Walk up towards the filesystem root, stopping once we encounter the Git + // repository root. The presence of **either** a `.git` *file* or + // *directory* counts. + let mut dir = config.cwd.clone(); + + // Canonicalize the path so that we do not end up in an infinite loop when + // `cwd` contains `..` components. + if let Ok(canon) = dir.canonicalize() { + dir = canon; + } + + while let Some(parent) = dir.parent() { + // `.git` can be a *file* (for worktrees or submodules) or a *dir*. + let git_marker = dir.join(".git"); + let git_exists = match tokio::fs::metadata(&git_marker).await { + Ok(_) => true, + Err(e) if e.kind() == std::io::ErrorKind::NotFound => false, + Err(e) => return Err(e), + }; + + if git_exists { + // We are at the repo root – attempt one final load. + if let Some(doc) = load_first_candidate(&dir, CANDIDATE_FILENAMES, max_bytes).await? { + return Ok(Some(doc)); + } + break; + } + + dir = parent.to_path_buf(); + } + + Ok(None) +} + +/// Attempt to load the first candidate file found in `dir`. Returns the file +/// contents (truncated if it exceeds `max_bytes`) when successful. +async fn load_first_candidate( + dir: &Path, + names: &[&str], + max_bytes: usize, +) -> std::io::Result> { + for name in names { + let candidate = dir.join(name); + + let file = match tokio::fs::File::open(&candidate).await { + Err(e) if e.kind() == std::io::ErrorKind::NotFound => continue, + Err(e) => return Err(e), + Ok(f) => f, + }; + + let size = file.metadata().await?.len(); + + let reader = tokio::io::BufReader::new(file); + let mut data = Vec::with_capacity(std::cmp::min(size as usize, max_bytes)); + let mut limited = reader.take(max_bytes as u64); + limited.read_to_end(&mut data).await?; + + if size as usize > max_bytes { + tracing::warn!( + "Project doc `{}` exceeds {max_bytes} bytes - truncating.", + candidate.display(), + ); + } + + let contents = String::from_utf8_lossy(&data).to_string(); + if contents.trim().is_empty() { + // Empty file – treat as not found. + continue; + } + + return Ok(Some(contents)); + } + + Ok(None) +} + +#[cfg(test)] +mod tests { + #![allow(clippy::unwrap_used)] + + use super::*; + use crate::config::Config; + use std::fs; + use tempfile::TempDir; + + /// Helper that returns a `Config` pointing at `root` and using `limit` as + /// the maximum number of bytes to embed from AGENTS.md. The caller can + /// optionally specify a custom `instructions` string – when `None` the + /// value is cleared to mimic a scenario where no system instructions have + /// been configured. + fn make_config(root: &TempDir, limit: usize, instructions: Option<&str>) -> Config { + let mut cfg = Config::load_default_config_for_test(); + cfg.cwd = root.path().to_path_buf(); + cfg.project_doc_max_bytes = limit; + + cfg.instructions = instructions.map(ToOwned::to_owned); + cfg + } + + /// AGENTS.md missing – should yield `None`. + #[tokio::test] + async fn no_doc_file_returns_none() { + let tmp = tempfile::tempdir().expect("tempdir"); + + let res = create_full_instructions(&make_config(&tmp, 4096, None)).await; + assert!( + res.is_none(), + "Expected None when AGENTS.md is absent and no system instructions provided" + ); + assert!(res.is_none(), "Expected None when AGENTS.md is absent"); + } + + /// Small file within the byte-limit is returned unmodified. + #[tokio::test] + async fn doc_smaller_than_limit_is_returned() { + let tmp = tempfile::tempdir().expect("tempdir"); + fs::write(tmp.path().join("AGENTS.md"), "hello world").unwrap(); + + let res = create_full_instructions(&make_config(&tmp, 4096, None)) + .await + .expect("doc expected"); + + assert_eq!( + res, "hello world", + "The document should be returned verbatim when it is smaller than the limit and there are no existing instructions" + ); + } + + /// Oversize file is truncated to `project_doc_max_bytes`. + #[tokio::test] + async fn doc_larger_than_limit_is_truncated() { + const LIMIT: usize = 1024; + let tmp = tempfile::tempdir().expect("tempdir"); + + let huge = "A".repeat(LIMIT * 2); // 2 KiB + fs::write(tmp.path().join("AGENTS.md"), &huge).unwrap(); + + let res = create_full_instructions(&make_config(&tmp, LIMIT, None)) + .await + .expect("doc expected"); + + assert_eq!(res.len(), LIMIT, "doc should be truncated to LIMIT bytes"); + assert_eq!(res, huge[..LIMIT]); + } + + /// When `cwd` is nested inside a repo, the search should locate AGENTS.md + /// placed at the repository root (identified by `.git`). + #[tokio::test] + async fn finds_doc_in_repo_root() { + let repo = tempfile::tempdir().expect("tempdir"); + + // Simulate a git repository. Note .git can be a file or a directory. + std::fs::write( + repo.path().join(".git"), + "gitdir: /path/to/actual/git/dir\n", + ) + .unwrap(); + + // Put the doc at the repo root. + fs::write(repo.path().join("AGENTS.md"), "root level doc").unwrap(); + + // Now create a nested working directory: repo/workspace/crate_a + let nested = repo.path().join("workspace/crate_a"); + std::fs::create_dir_all(&nested).unwrap(); + + // Build config pointing at the nested dir. + let mut cfg = make_config(&repo, 4096, None); + cfg.cwd = nested; + + let res = create_full_instructions(&cfg).await.expect("doc expected"); + assert_eq!(res, "root level doc"); + } + + /// Explicitly setting the byte-limit to zero disables project docs. + #[tokio::test] + async fn zero_byte_limit_disables_docs() { + let tmp = tempfile::tempdir().expect("tempdir"); + fs::write(tmp.path().join("AGENTS.md"), "something").unwrap(); + + let res = create_full_instructions(&make_config(&tmp, 0, None)).await; + assert!( + res.is_none(), + "With limit 0 the function should return None" + ); + } + + /// When both system instructions *and* a project doc are present the two + /// should be concatenated with the separator. + #[tokio::test] + async fn merges_existing_instructions_with_project_doc() { + let tmp = tempfile::tempdir().expect("tempdir"); + fs::write(tmp.path().join("AGENTS.md"), "proj doc").unwrap(); + + const INSTRUCTIONS: &str = "base instructions"; + + let res = create_full_instructions(&make_config(&tmp, 4096, Some(INSTRUCTIONS))) + .await + .expect("should produce a combined instruction string"); + + let expected = format!("{INSTRUCTIONS}{PROJECT_DOC_SEPARATOR}{}", "proj doc"); + + assert_eq!(res, expected); + } + + /// If there are existing system instructions but the project doc is + /// missing we expect the original instructions to be returned unchanged. + #[tokio::test] + async fn keeps_existing_instructions_when_doc_missing() { + let tmp = tempfile::tempdir().expect("tempdir"); + + const INSTRUCTIONS: &str = "some instructions"; + + let res = create_full_instructions(&make_config(&tmp, 4096, Some(INSTRUCTIONS))).await; + + assert_eq!(res, Some(INSTRUCTIONS.to_string())); + } +}