diff --git a/codex-rs/apply-patch/src/lib.rs b/codex-rs/apply-patch/src/lib.rs
index 05ea7496..bd9e4044 100644
--- a/codex-rs/apply-patch/src/lib.rs
+++ b/codex-rs/apply-patch/src/lib.rs
@@ -820,6 +820,51 @@ PATCH"#,
         assert_eq!(contents, "a\nB\nc\nd\nE\nf\ng\n");
     }
 
+    /// Ensure that patches authored with ASCII characters can update lines that
+    /// contain typographic Unicode punctuation (e.g. EN DASH, NON-BREAKING
+    /// HYPHEN). Historically `git apply` succeeds in such scenarios, but our
+    /// internal matcher failed because it required an exact byte-for-byte match.
+    /// The fuzzy-matching pass that normalises common punctuation should now
+    /// bridge the gap.
+    #[test]
+    fn test_update_line_with_unicode_dash() {
+        let dir = tempdir().unwrap();
+        let path = dir.path().join("unicode.py");
+
+        // Original line contains EN DASH (\u{2013}) and NON-BREAKING HYPHEN (\u{2011}).
+        let original = "import asyncio # local import \u{2013} avoids top\u{2011}level dep\n";
+        std::fs::write(&path, original).unwrap();
+
+        // Patch uses plain ASCII dash / hyphen.
+        let patch = wrap_patch(&format!(
+            r#"*** Update File: {}
+@@
+-import asyncio # local import - avoids top-level dep
++import asyncio # HELLO"#,
+            path.display()
+        ));
+
+        let mut stdout = Vec::new();
+        let mut stderr = Vec::new();
+        apply_patch(&patch, &mut stdout, &mut stderr).unwrap();
+
+        // File should now contain the replaced comment.
+        let expected = "import asyncio # HELLO\n";
+        let contents = std::fs::read_to_string(&path).unwrap();
+        assert_eq!(contents, expected);
+
+        // Ensure the success summary lists the file as modified.
+        let stdout_str = String::from_utf8(stdout).unwrap();
+        let expected_out = format!(
+            "Success. Updated the following files:\nM {}\n",
+            path.display()
+        );
+        assert_eq!(stdout_str, expected_out);
+
+        // No stderr expected.
+        assert_eq!(String::from_utf8(stderr).unwrap(), "");
+    }
+
     #[test]
     fn test_unified_diff() {
         // Start with a file containing four lines.
diff --git a/codex-rs/apply-patch/src/seek_sequence.rs b/codex-rs/apply-patch/src/seek_sequence.rs
index c379767d..0144580f 100644
--- a/codex-rs/apply-patch/src/seek_sequence.rs
+++ b/codex-rs/apply-patch/src/seek_sequence.rs
@@ -63,6 +63,49 @@ pub(crate) fn seek_sequence(
             return Some(i);
         }
     }
+
+    // ------------------------------------------------------------------
+    // Final, most permissive pass – attempt to match after *normalising*
+    // common Unicode punctuation to their ASCII equivalents so that diffs
+    // authored with plain ASCII characters can still be applied to source
+    // files that contain typographic dashes / quotes, etc. This mirrors the
+    // fuzzy behaviour of `git apply`, which ignores minor byte-level
+    // differences when locating context lines.
+    // ------------------------------------------------------------------
+
+    fn normalise(s: &str) -> String {
+        s.trim()
+            .chars()
+            .map(|c| match c {
+                // Various dash / hyphen code-points → ASCII '-'
+                '\u{2010}' | '\u{2011}' | '\u{2012}' | '\u{2013}' | '\u{2014}' | '\u{2015}'
+                | '\u{2212}' => '-',
+                // Fancy single quotes → '\''
+                '\u{2018}' | '\u{2019}' | '\u{201A}' | '\u{201B}' => '\'',
+                // Fancy double quotes → '"'
+                '\u{201C}' | '\u{201D}' | '\u{201E}' | '\u{201F}' => '"',
+                // Non-breaking space and other odd spaces → normal space
+                '\u{00A0}' | '\u{2002}' | '\u{2003}' | '\u{2004}' | '\u{2005}' | '\u{2006}'
+                | '\u{2007}' | '\u{2008}' | '\u{2009}' | '\u{200A}' | '\u{202F}' | '\u{205F}'
+                | '\u{3000}' => ' ',
+                other => other,
+            })
+            .collect::<String>()
+    }
+
+    for i in search_start..=lines.len().saturating_sub(pattern.len()) {
+        let mut ok = true;
+        for (p_idx, pat) in pattern.iter().enumerate() {
+            if normalise(&lines[i + p_idx]) != normalise(pat) {
+                ok = false;
+                break;
+            }
+        }
+        if ok {
+            return Some(i);
+        }
+    }
+
     None
 }
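
Note (outside the patch above): the snippet below is a minimal standalone sketch of the normalisation idea, not the crate's API. The names `normalise_line`, `source`, and `ascii_context` are hypothetical, and only the dash and space mappings relevant to the new test are reproduced; the full helper added in seek_sequence.rs covers more code points.

// Standalone sketch (hypothetical names): shows why the ASCII context line in
// the patch compares equal to a source line containing an EN DASH and a
// NON-BREAKING HYPHEN once both sides are normalised.
fn normalise_line(s: &str) -> String {
    s.trim()
        .chars()
        .map(|c| match c {
            // Dash-like code points collapse to ASCII '-'.
            '\u{2010}' | '\u{2011}' | '\u{2012}' | '\u{2013}' | '\u{2014}' | '\u{2015}'
            | '\u{2212}' => '-',
            // Non-breaking space becomes a plain space.
            '\u{00A0}' => ' ',
            other => other,
        })
        .collect()
}

fn main() {
    let source = "import asyncio # local import \u{2013} avoids top\u{2011}level dep";
    let ascii_context = "import asyncio # local import - avoids top-level dep";
    assert_eq!(normalise_line(source), normalise_line(ascii_context));
    println!("normalised: {}", normalise_line(source));
}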