feat: more loosely match context for apply_patch (#610)

More of a proposal than anything, but models seem to struggle to compose valid
patches for `apply_patch` when context matching involves Unicode look-alikes.
This change would normalize them.

```
top-level          # ASCII
top‑level          # U+2011 NON-BREAKING HYPHEN
top–level          # U+2013 EN DASH
top—level          # U+2014 EM DASH
top‒level          # U+2012 FIGURE DASH
```

thanks unicode.
This commit is contained in:
Misha Davidov
2025-04-24 09:05:19 -07:00
committed by GitHub
parent ad1e39c903
commit 9b102965b9
2 changed files with 138 additions and 11 deletions

View File

@@ -211,9 +211,44 @@ class Parser {
}
if (defStr.trim()) {
let found = false;
if (!fileLines.slice(0, index).some((s) => s === defStr)) {
// ------------------------------------------------------------------
// Equality helpers using the canonicalisation from find_context_core.
// (We duplicate a minimal version here because the scope is local.)
// ------------------------------------------------------------------
const canonLocal = (s: string): string =>
s.normalize("NFC").replace(
/./gu,
(c) =>
((
{
"-": "-",
"\u2010": "-",
"\u2011": "-",
"\u2012": "-",
"\u2013": "-",
"\u2014": "-",
"\u2212": "-",
"\u0022": '"',
"\u201C": '"',
"\u201D": '"',
"\u201E": '"',
"\u00AB": '"',
"\u00BB": '"',
"\u0027": "'",
"\u2018": "'",
"\u2019": "'",
"\u201B": "'",
} as Record<string, string>
)[c] ?? c),
);
if (
!fileLines
.slice(0, index)
.some((s) => canonLocal(s) === canonLocal(defStr))
) {
for (let i = index; i < fileLines.length; i++) {
if (fileLines[i] === defStr) {
if (canonLocal(fileLines[i]!) === canonLocal(defStr)) {
index = i + 1;
found = true;
break;
@@ -222,10 +257,14 @@ class Parser {
}
if (
!found &&
!fileLines.slice(0, index).some((s) => s.trim() === defStr.trim())
!fileLines
.slice(0, index)
.some((s) => canonLocal(s.trim()) === canonLocal(defStr.trim()))
) {
for (let i = index; i < fileLines.length; i++) {
if (fileLines[i]!.trim() === defStr.trim()) {
if (
canonLocal(fileLines[i]!.trim()) === canonLocal(defStr.trim())
) {
index = i + 1;
this.fuzz += 1;
found = true;
@@ -293,34 +332,94 @@ function find_context_core(
context: Array<string>,
start: number,
): [number, number] {
// ---------------------------------------------------------------------------
// Helpers — Unicode punctuation normalisation
// ---------------------------------------------------------------------------
/*
* The patch-matching algorithm originally required **exact** string equality
* for non-whitespace characters. That breaks when the file on disk contains
* visually identical but different Unicode code-points (e.g. “EN DASH” vs
* ASCII "-"), because models almost always emit the ASCII variant. To make
* apply_patch resilient we canonicalise a handful of common punctuation
* look-alikes before doing comparisons.
*
* We purposefully keep the mapping *small* — only characters that routinely
* appear in source files and are highly unlikely to introduce ambiguity are
* included. Each entry is written using the corresponding Unicode escape so
* that the file remains ASCII-only even after transpilation.
*/
const PUNCT_EQUIV: Record<string, string> = {
// Hyphen / dash variants --------------------------------------------------
/* U+002D HYPHEN-MINUS */ "-": "-",
/* U+2010 HYPHEN */ "\u2010": "-",
/* U+2011 NO-BREAK HYPHEN */ "\u2011": "-",
/* U+2012 FIGURE DASH */ "\u2012": "-",
/* U+2013 EN DASH */ "\u2013": "-",
/* U+2014 EM DASH */ "\u2014": "-",
/* U+2212 MINUS SIGN */ "\u2212": "-",
// Double quotes -----------------------------------------------------------
/* U+0022 QUOTATION MARK */ "\u0022": '"',
/* U+201C LEFT DOUBLE QUOTATION MARK */ "\u201C": '"',
/* U+201D RIGHT DOUBLE QUOTATION MARK */ "\u201D": '"',
/* U+201E DOUBLE LOW-9 QUOTATION MARK */ "\u201E": '"',
/* U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */ "\u00AB": '"',
/* U+00BB RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */ "\u00BB": '"',
// Single quotes -----------------------------------------------------------
/* U+0027 APOSTROPHE */ "\u0027": "'",
/* U+2018 LEFT SINGLE QUOTATION MARK */ "\u2018": "'",
/* U+2019 RIGHT SINGLE QUOTATION MARK */ "\u2019": "'",
/* U+201B SINGLE HIGH-REVERSED-9 QUOTATION MARK */ "\u201B": "'",
};
const canon = (s: string): string =>
s
// Canonical Unicode composition first
.normalize("NFC")
// Replace punctuation look-alikes
.replace(/./gu, (c) => PUNCT_EQUIV[c] ?? c);
if (context.length === 0) {
return [start, 0];
}
// Pass 1 — exact equality after canonicalisation -------------------------
for (let i = start; i < lines.length; i++) {
if (lines.slice(i, i + context.length).join("\n") === context.join("\n")) {
const segment = canon(lines.slice(i, i + context.length).join("\n"));
if (segment === canon(context.join("\n"))) {
return [i, 0];
}
}
// Pass 2 — ignore trailing whitespace -----------------------------------
for (let i = start; i < lines.length; i++) {
if (
const segment = canon(
lines
.slice(i, i + context.length)
.map((s) => s.trimEnd())
.join("\n") === context.map((s) => s.trimEnd()).join("\n")
) {
.join("\n"),
);
const ctx = canon(context.map((s) => s.trimEnd()).join("\n"));
if (segment === ctx) {
return [i, 1];
}
}
// Pass 3 — ignore all surrounding whitespace ----------------------------
for (let i = start; i < lines.length; i++) {
if (
const segment = canon(
lines
.slice(i, i + context.length)
.map((s) => s.trim())
.join("\n") === context.map((s) => s.trim()).join("\n")
) {
.join("\n"),
);
const ctx = canon(context.map((s) => s.trim()).join("\n"));
if (segment === ctx) {
return [i, 100];
}
}
return [-1, 0];
}