fix: handling weird unicode characters in apply_patch (#674)

I ❤ unicode
Authored by Misha Davidov on 2025-04-25 16:01:58 -07:00, committed by GitHub
parent c18f1689a9
commit 15bf5ca971
2 changed files with 88 additions and 0 deletions


@@ -820,6 +820,51 @@ PATCH"#,
    assert_eq!(contents, "a\nB\nc\nd\nE\nf\ng\n");
}
/// Ensure that patches authored with ASCII characters can update lines that
/// contain typographic Unicode punctuation (e.g. EN DASH, NON-BREAKING
/// HYPHEN). Historically `git apply` succeeds in such scenarios, but our
/// internal matcher failed because it required an exact byte-for-byte match.
/// The fuzzy-matching pass that normalises common punctuation should now
/// bridge the gap.
#[test]
fn test_update_line_with_unicode_dash() {
    let dir = tempdir().unwrap();
    let path = dir.path().join("unicode.py");

    // Original line contains EN DASH (\u{2013}) and NON-BREAKING HYPHEN (\u{2011}).
    let original = "import asyncio # local import \u{2013} avoids top\u{2011}level dep\n";
    std::fs::write(&path, original).unwrap();

    // Patch uses plain ASCII dash / hyphen.
    let patch = wrap_patch(&format!(
        r#"*** Update File: {}
@@
-import asyncio # local import - avoids top-level dep
+import asyncio # HELLO"#,
        path.display()
    ));

    let mut stdout = Vec::new();
    let mut stderr = Vec::new();
    apply_patch(&patch, &mut stdout, &mut stderr).unwrap();

    // File should now contain the replaced comment.
    let expected = "import asyncio # HELLO\n";
    let contents = std::fs::read_to_string(&path).unwrap();
    assert_eq!(contents, expected);

    // Ensure success summary lists the file as modified.
    let stdout_str = String::from_utf8(stdout).unwrap();
    let expected_out = format!(
        "Success. Updated the following files:\nM {}\n",
        path.display()
    );
    assert_eq!(stdout_str, expected_out);

    // No stderr expected.
    assert_eq!(String::from_utf8(stderr).unwrap(), "");
}
#[test]
fn test_unified_diff() {
    // Start with a file containing four lines.
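
A minimal standalone sketch (illustration only, not part of this commit) of the failure mode the new test exercises: the EN DASH and NON-BREAKING HYPHEN in the source line each occupy several UTF-8 bytes, so an exact string comparison against the ASCII patch line can never succeed.

fn main() {
    // Hypothetical inputs mirroring the test above: file contents vs. patch context.
    let file_line = "import asyncio # local import \u{2013} avoids top\u{2011}level dep";
    let patch_line = "import asyncio # local import - avoids top-level dep";
    // EN DASH (U+2013) is three UTF-8 bytes while ASCII '-' is one, so a
    // byte-for-byte comparison fails even though the lines look alike.
    assert_eq!("\u{2013}".len(), 3);
    assert_ne!(file_line, patch_line);
}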


@@ -63,6 +63,49 @@ pub(crate) fn seek_sequence(
            return Some(i);
        }
    }
    // ------------------------------------------------------------------
    // Final, most permissive pass: attempt to match after *normalising*
    // common Unicode punctuation to its ASCII equivalents, so that diffs
    // authored with plain ASCII characters can still be applied to source
    // files that contain typographic dashes / quotes, etc. This mirrors the
    // fuzzy behaviour of `git apply`, which ignores minor byte-level
    // differences when locating context lines.
    // ------------------------------------------------------------------
    fn normalise(s: &str) -> String {
        s.trim()
            .chars()
            .map(|c| match c {
                // Various dash / hyphen code-points → ASCII '-'
                '\u{2010}' | '\u{2011}' | '\u{2012}' | '\u{2013}' | '\u{2014}' | '\u{2015}'
                | '\u{2212}' => '-',
                // Fancy single quotes → '\''
                '\u{2018}' | '\u{2019}' | '\u{201A}' | '\u{201B}' => '\'',
                // Fancy double quotes → '"'
                '\u{201C}' | '\u{201D}' | '\u{201E}' | '\u{201F}' => '"',
                // Non-breaking space and other odd spaces → normal space
                '\u{00A0}' | '\u{2002}' | '\u{2003}' | '\u{2004}' | '\u{2005}' | '\u{2006}'
                | '\u{2007}' | '\u{2008}' | '\u{2009}' | '\u{200A}' | '\u{202F}' | '\u{205F}'
                | '\u{3000}' => ' ',
                other => other,
            })
            .collect::<String>()
    }

    for i in search_start..=lines.len().saturating_sub(pattern.len()) {
        let mut ok = true;
        for (p_idx, pat) in pattern.iter().enumerate() {
            if normalise(&lines[i + p_idx]) != normalise(pat) {
                ok = false;
                break;
            }
        }
        if ok {
            return Some(i);
        }
    }

    None
}
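
For illustration (not part of this commit), a reduced standalone sketch of the normalisation pass above, mapping only a handful of the code points, to show that the two comment lines from the new test compare equal once normalised:

// Reduced sketch: same idea as the commit's normalise(), fewer mapped code points.
fn normalise(s: &str) -> String {
    s.trim()
        .chars()
        .map(|c| match c {
            '\u{2010}' | '\u{2011}' | '\u{2013}' | '\u{2014}' => '-', // dashes → '-'
            '\u{00A0}' => ' ',                                        // NBSP → space
            other => other,
        })
        .collect()
}

fn main() {
    let file_line = "import asyncio # local import \u{2013} avoids top\u{2011}level dep";
    let patch_line = "import asyncio # local import - avoids top-level dep";
    // The raw lines differ, but their normalised forms agree, so the fuzzy
    // pass can locate the context line that the exact pass missed.
    assert_ne!(file_line, patch_line);
    assert_eq!(normalise(file_line), normalise(patch_line));
}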