From 9b102965b9392f9edbbece64965503b4ec729454 Mon Sep 17 00:00:00 2001
From: Misha Davidov
Date: Thu, 24 Apr 2025 09:05:19 -0700
Subject: [PATCH] feat: more loosely match context for apply_patch (#610)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

More of a proposal than anything, but models seem to struggle to compose
valid patches for `apply_patch` when context matching involves Unicode
look-alikes. This would normalize them.

```
top-level  # ASCII
top‑level  # U+2011 NON-BREAKING HYPHEN
top–level  # U+2013 EN DASH
top—level  # U+2014 EM DASH
top‒level  # U+2012 FIGURE DASH
```

thanks unicode.
---
 codex-cli/src/utils/agent/apply-patch.ts | 121 ++++++++++++++++++++---
 codex-cli/tests/apply-patch.test.ts      |  28 ++++++
 2 files changed, 138 insertions(+), 11 deletions(-)

diff --git a/codex-cli/src/utils/agent/apply-patch.ts b/codex-cli/src/utils/agent/apply-patch.ts
index c0b1eaf2..fe1dc76b 100644
--- a/codex-cli/src/utils/agent/apply-patch.ts
+++ b/codex-cli/src/utils/agent/apply-patch.ts
@@ -211,9 +211,44 @@ class Parser {
       }
       if (defStr.trim()) {
         let found = false;
-        if (!fileLines.slice(0, index).some((s) => s === defStr)) {
+        // ------------------------------------------------------------------
+        // Equality helpers using the canonicalisation from find_context_core.
+        // (We duplicate a minimal version here because the scope is local.)
+        // ------------------------------------------------------------------
+        const canonLocal = (s: string): string =>
+          s.normalize("NFC").replace(
+            /./gu,
+            (c) =>
+              ((
+                {
+                  "-": "-",
+                  "\u2010": "-",
+                  "\u2011": "-",
+                  "\u2012": "-",
+                  "\u2013": "-",
+                  "\u2014": "-",
+                  "\u2212": "-",
+                  "\u0022": '"',
+                  "\u201C": '"',
+                  "\u201D": '"',
+                  "\u201E": '"',
+                  "\u00AB": '"',
+                  "\u00BB": '"',
+                  "\u0027": "'",
+                  "\u2018": "'",
+                  "\u2019": "'",
+                  "\u201B": "'",
+                } as Record<string, string>
+              )[c] ?? c),
+          );
+
+        if (
+          !fileLines
+            .slice(0, index)
+            .some((s) => canonLocal(s) === canonLocal(defStr))
+        ) {
           for (let i = index; i < fileLines.length; i++) {
-            if (fileLines[i] === defStr) {
+            if (canonLocal(fileLines[i]!) === canonLocal(defStr)) {
               index = i + 1;
               found = true;
               break;
@@ -222,10 +257,14 @@ class Parser {
         }
         if (
           !found &&
-          !fileLines.slice(0, index).some((s) => s.trim() === defStr.trim())
+          !fileLines
+            .slice(0, index)
+            .some((s) => canonLocal(s.trim()) === canonLocal(defStr.trim()))
         ) {
           for (let i = index; i < fileLines.length; i++) {
-            if (fileLines[i]!.trim() === defStr.trim()) {
+            if (
+              canonLocal(fileLines[i]!.trim()) === canonLocal(defStr.trim())
+            ) {
               index = i + 1;
               this.fuzz += 1;
               found = true;
@@ -293,34 +332,94 @@ function find_context_core(
   context: Array<string>,
   start: number,
 ): [number, number] {
+  // ---------------------------------------------------------------------------
+  // Helpers – Unicode punctuation normalisation
+  // ---------------------------------------------------------------------------
+
+  /*
+   * The patch-matching algorithm originally required **exact** string equality
+   * for non-whitespace characters. That breaks when the file on disk contains
+   * visually identical but different Unicode code-points (e.g. “EN DASH” vs
+   * ASCII "-"), because models almost always emit the ASCII variant. To make
+   * apply_patch resilient we canonicalise a handful of common punctuation
+   * look-alikes before doing comparisons.
+   *
+   * We purposefully keep the mapping *small* – only characters that routinely
+   * appear in source files and are highly unlikely to introduce ambiguity are
+   * included. Each entry is written using the corresponding Unicode escape so
+   * that the file remains ASCII-only even after transpilation.
+   */
+
+  const PUNCT_EQUIV: Record<string, string> = {
+    // Hyphen / dash variants --------------------------------------------------
+    /* U+002D HYPHEN-MINUS */ "-": "-",
+    /* U+2010 HYPHEN */ "\u2010": "-",
+    /* U+2011 NO-BREAK HYPHEN */ "\u2011": "-",
+    /* U+2012 FIGURE DASH */ "\u2012": "-",
+    /* U+2013 EN DASH */ "\u2013": "-",
+    /* U+2014 EM DASH */ "\u2014": "-",
+    /* U+2212 MINUS SIGN */ "\u2212": "-",
+
+    // Double quotes -----------------------------------------------------------
+    /* U+0022 QUOTATION MARK */ "\u0022": '"',
+    /* U+201C LEFT DOUBLE QUOTATION MARK */ "\u201C": '"',
+    /* U+201D RIGHT DOUBLE QUOTATION MARK */ "\u201D": '"',
+    /* U+201E DOUBLE LOW-9 QUOTATION MARK */ "\u201E": '"',
+    /* U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */ "\u00AB": '"',
+    /* U+00BB RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */ "\u00BB": '"',
+
+    // Single quotes -----------------------------------------------------------
+    /* U+0027 APOSTROPHE */ "\u0027": "'",
+    /* U+2018 LEFT SINGLE QUOTATION MARK */ "\u2018": "'",
+    /* U+2019 RIGHT SINGLE QUOTATION MARK */ "\u2019": "'",
+    /* U+201B SINGLE HIGH-REVERSED-9 QUOTATION MARK */ "\u201B": "'",
+  };
+
+  const canon = (s: string): string =>
+    s
+      // Canonical Unicode composition first
+      .normalize("NFC")
+      // Replace punctuation look-alikes
+      .replace(/./gu, (c) => PUNCT_EQUIV[c] ?? c);
   if (context.length === 0) {
     return [start, 0];
   }
+  // Pass 1 – exact equality after canonicalisation ---------------------------
   for (let i = start; i < lines.length; i++) {
-    if (lines.slice(i, i + context.length).join("\n") === context.join("\n")) {
+    const segment = canon(lines.slice(i, i + context.length).join("\n"));
+    if (segment === canon(context.join("\n"))) {
       return [i, 0];
     }
   }
+
+  // Pass 2 – ignore trailing whitespace -------------------------------------
   for (let i = start; i < lines.length; i++) {
-    if (
+    const segment = canon(
       lines
         .slice(i, i + context.length)
         .map((s) => s.trimEnd())
-        .join("\n") === context.map((s) => s.trimEnd()).join("\n")
-    ) {
+        .join("\n"),
+    );
+    const ctx = canon(context.map((s) => s.trimEnd()).join("\n"));
+    if (segment === ctx) {
       return [i, 1];
     }
   }
+
+  // Pass 3 – ignore all surrounding whitespace ------------------------------
   for (let i = start; i < lines.length; i++) {
-    if (
+    const segment = canon(
       lines
         .slice(i, i + context.length)
         .map((s) => s.trim())
-        .join("\n") === context.map((s) => s.trim()).join("\n")
-    ) {
+        .join("\n"),
+    );
+    const ctx = canon(context.map((s) => s.trim()).join("\n"));
+    if (segment === ctx) {
       return [i, 100];
     }
   }
+
   return [-1, 0];
 }
 
diff --git a/codex-cli/tests/apply-patch.test.ts b/codex-cli/tests/apply-patch.test.ts
index 0fbd5476..e1532c01 100644
--- a/codex-cli/tests/apply-patch.test.ts
+++ b/codex-cli/tests/apply-patch.test.ts
@@ -56,6 +56,34 @@ test("process_patch - update file", () => {
   expect(fs.removals).toEqual([]);
 });
 
+// ---------------------------------------------------------------------------
+// Unicode canonicalisation tests – hyphen / dash / quote look-alikes
+// ---------------------------------------------------------------------------
+
+test("process_patch tolerates hyphen/dash variants", () => {
+  // The file contains EN DASH (\u2013) and NO-BREAK HYPHEN (\u2011)
+  const original =
+    "first\nimport foo # local import \u2013 avoids top\u2011level dep\nlast";
+
+  const patch = `*** Begin Patch\n*** Update File: uni.txt\n@@\n-import foo # local import - avoids top-level dep\n+import foo # HANDLED\n*** End Patch`;
+
+  const fs = createInMemoryFS({ "uni.txt": original });
+  process_patch(patch, fs.openFn, fs.writeFn, fs.removeFn);
+
+  expect(fs.files["uni.txt"]!.includes("HANDLED")).toBe(true);
+});
+
+test.skip("process_patch tolerates smart quotes", () => {
+  const original = "console.log(\u201Chello\u201D);"; // “hello” with smart quotes
+
+  const patch = `*** Begin Patch\n*** Update File: quotes.js\n@@\n-console.log(\\"hello\\");\n+console.log(\\"HELLO\\");\n*** End Patch`;
+
+  const fs = createInMemoryFS({ "quotes.js": original });
+  process_patch(patch, fs.openFn, fs.writeFn, fs.removeFn);
+
+  expect(fs.files["quotes.js"]).toBe('console.log("HELLO");');
+});
+
 test("process_patch - add file", () => {
   const patch = `*** Begin Patch
 *** Add File: b.txt