feat: more loosely match context for apply_patch (#610)
More of a proposal than anything but models seem to struggle with composing valid patches for `apply_patch` for context matching when there are unicode look-a-likes involved. This would normalize them. ``` top-level # ASCII top-level # U+2011 NON-BREAKING HYPHEN top–level # U+2013 EN DASH top—level # U+2014 EM DASH top‒level # U+2012 FIGURE DASH ``` thanks unicode.
This commit is contained in:
@@ -211,9 +211,44 @@ class Parser {
|
|||||||
}
|
}
|
||||||
if (defStr.trim()) {
|
if (defStr.trim()) {
|
||||||
let found = false;
|
let found = false;
|
||||||
if (!fileLines.slice(0, index).some((s) => s === defStr)) {
|
// ------------------------------------------------------------------
|
||||||
|
// Equality helpers using the canonicalisation from find_context_core.
|
||||||
|
// (We duplicate a minimal version here because the scope is local.)
|
||||||
|
// ------------------------------------------------------------------
|
||||||
|
const canonLocal = (s: string): string =>
|
||||||
|
s.normalize("NFC").replace(
|
||||||
|
/./gu,
|
||||||
|
(c) =>
|
||||||
|
((
|
||||||
|
{
|
||||||
|
"-": "-",
|
||||||
|
"\u2010": "-",
|
||||||
|
"\u2011": "-",
|
||||||
|
"\u2012": "-",
|
||||||
|
"\u2013": "-",
|
||||||
|
"\u2014": "-",
|
||||||
|
"\u2212": "-",
|
||||||
|
"\u0022": '"',
|
||||||
|
"\u201C": '"',
|
||||||
|
"\u201D": '"',
|
||||||
|
"\u201E": '"',
|
||||||
|
"\u00AB": '"',
|
||||||
|
"\u00BB": '"',
|
||||||
|
"\u0027": "'",
|
||||||
|
"\u2018": "'",
|
||||||
|
"\u2019": "'",
|
||||||
|
"\u201B": "'",
|
||||||
|
} as Record<string, string>
|
||||||
|
)[c] ?? c),
|
||||||
|
);
|
||||||
|
|
||||||
|
if (
|
||||||
|
!fileLines
|
||||||
|
.slice(0, index)
|
||||||
|
.some((s) => canonLocal(s) === canonLocal(defStr))
|
||||||
|
) {
|
||||||
for (let i = index; i < fileLines.length; i++) {
|
for (let i = index; i < fileLines.length; i++) {
|
||||||
if (fileLines[i] === defStr) {
|
if (canonLocal(fileLines[i]!) === canonLocal(defStr)) {
|
||||||
index = i + 1;
|
index = i + 1;
|
||||||
found = true;
|
found = true;
|
||||||
break;
|
break;
|
||||||
@@ -222,10 +257,14 @@ class Parser {
|
|||||||
}
|
}
|
||||||
if (
|
if (
|
||||||
!found &&
|
!found &&
|
||||||
!fileLines.slice(0, index).some((s) => s.trim() === defStr.trim())
|
!fileLines
|
||||||
|
.slice(0, index)
|
||||||
|
.some((s) => canonLocal(s.trim()) === canonLocal(defStr.trim()))
|
||||||
) {
|
) {
|
||||||
for (let i = index; i < fileLines.length; i++) {
|
for (let i = index; i < fileLines.length; i++) {
|
||||||
if (fileLines[i]!.trim() === defStr.trim()) {
|
if (
|
||||||
|
canonLocal(fileLines[i]!.trim()) === canonLocal(defStr.trim())
|
||||||
|
) {
|
||||||
index = i + 1;
|
index = i + 1;
|
||||||
this.fuzz += 1;
|
this.fuzz += 1;
|
||||||
found = true;
|
found = true;
|
||||||
@@ -293,34 +332,94 @@ function find_context_core(
|
|||||||
context: Array<string>,
|
context: Array<string>,
|
||||||
start: number,
|
start: number,
|
||||||
): [number, number] {
|
): [number, number] {
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Helpers – Unicode punctuation normalisation
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The patch-matching algorithm originally required **exact** string equality
|
||||||
|
* for non-whitespace characters. That breaks when the file on disk contains
|
||||||
|
* visually identical but different Unicode code-points (e.g. “EN DASH” vs
|
||||||
|
* ASCII "-"), because models almost always emit the ASCII variant. To make
|
||||||
|
* apply_patch resilient we canonicalise a handful of common punctuation
|
||||||
|
* look-alikes before doing comparisons.
|
||||||
|
*
|
||||||
|
* We purposefully keep the mapping *small* – only characters that routinely
|
||||||
|
* appear in source files and are highly unlikely to introduce ambiguity are
|
||||||
|
* included. Each entry is written using the corresponding Unicode escape so
|
||||||
|
* that the file remains ASCII-only even after transpilation.
|
||||||
|
*/
|
||||||
|
|
||||||
|
const PUNCT_EQUIV: Record<string, string> = {
|
||||||
|
// Hyphen / dash variants --------------------------------------------------
|
||||||
|
/* U+002D HYPHEN-MINUS */ "-": "-",
|
||||||
|
/* U+2010 HYPHEN */ "\u2010": "-",
|
||||||
|
/* U+2011 NO-BREAK HYPHEN */ "\u2011": "-",
|
||||||
|
/* U+2012 FIGURE DASH */ "\u2012": "-",
|
||||||
|
/* U+2013 EN DASH */ "\u2013": "-",
|
||||||
|
/* U+2014 EM DASH */ "\u2014": "-",
|
||||||
|
/* U+2212 MINUS SIGN */ "\u2212": "-",
|
||||||
|
|
||||||
|
// Double quotes -----------------------------------------------------------
|
||||||
|
/* U+0022 QUOTATION MARK */ "\u0022": '"',
|
||||||
|
/* U+201C LEFT DOUBLE QUOTATION MARK */ "\u201C": '"',
|
||||||
|
/* U+201D RIGHT DOUBLE QUOTATION MARK */ "\u201D": '"',
|
||||||
|
/* U+201E DOUBLE LOW-9 QUOTATION MARK */ "\u201E": '"',
|
||||||
|
/* U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */ "\u00AB": '"',
|
||||||
|
/* U+00BB RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */ "\u00BB": '"',
|
||||||
|
|
||||||
|
// Single quotes -----------------------------------------------------------
|
||||||
|
/* U+0027 APOSTROPHE */ "\u0027": "'",
|
||||||
|
/* U+2018 LEFT SINGLE QUOTATION MARK */ "\u2018": "'",
|
||||||
|
/* U+2019 RIGHT SINGLE QUOTATION MARK */ "\u2019": "'",
|
||||||
|
/* U+201B SINGLE HIGH-REVERSED-9 QUOTATION MARK */ "\u201B": "'",
|
||||||
|
};
|
||||||
|
|
||||||
|
const canon = (s: string): string =>
|
||||||
|
s
|
||||||
|
// Canonical Unicode composition first
|
||||||
|
.normalize("NFC")
|
||||||
|
// Replace punctuation look-alikes
|
||||||
|
.replace(/./gu, (c) => PUNCT_EQUIV[c] ?? c);
|
||||||
if (context.length === 0) {
|
if (context.length === 0) {
|
||||||
return [start, 0];
|
return [start, 0];
|
||||||
}
|
}
|
||||||
|
// Pass 1 – exact equality after canonicalisation ---------------------------
|
||||||
for (let i = start; i < lines.length; i++) {
|
for (let i = start; i < lines.length; i++) {
|
||||||
if (lines.slice(i, i + context.length).join("\n") === context.join("\n")) {
|
const segment = canon(lines.slice(i, i + context.length).join("\n"));
|
||||||
|
if (segment === canon(context.join("\n"))) {
|
||||||
return [i, 0];
|
return [i, 0];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Pass 2 – ignore trailing whitespace -------------------------------------
|
||||||
for (let i = start; i < lines.length; i++) {
|
for (let i = start; i < lines.length; i++) {
|
||||||
if (
|
const segment = canon(
|
||||||
lines
|
lines
|
||||||
.slice(i, i + context.length)
|
.slice(i, i + context.length)
|
||||||
.map((s) => s.trimEnd())
|
.map((s) => s.trimEnd())
|
||||||
.join("\n") === context.map((s) => s.trimEnd()).join("\n")
|
.join("\n"),
|
||||||
) {
|
);
|
||||||
|
const ctx = canon(context.map((s) => s.trimEnd()).join("\n"));
|
||||||
|
if (segment === ctx) {
|
||||||
return [i, 1];
|
return [i, 1];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Pass 3 – ignore all surrounding whitespace ------------------------------
|
||||||
for (let i = start; i < lines.length; i++) {
|
for (let i = start; i < lines.length; i++) {
|
||||||
if (
|
const segment = canon(
|
||||||
lines
|
lines
|
||||||
.slice(i, i + context.length)
|
.slice(i, i + context.length)
|
||||||
.map((s) => s.trim())
|
.map((s) => s.trim())
|
||||||
.join("\n") === context.map((s) => s.trim()).join("\n")
|
.join("\n"),
|
||||||
) {
|
);
|
||||||
|
const ctx = canon(context.map((s) => s.trim()).join("\n"));
|
||||||
|
if (segment === ctx) {
|
||||||
return [i, 100];
|
return [i, 100];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return [-1, 0];
|
return [-1, 0];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -56,6 +56,34 @@ test("process_patch - update file", () => {
|
|||||||
expect(fs.removals).toEqual([]);
|
expect(fs.removals).toEqual([]);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Unicode canonicalisation tests – hyphen / dash / quote look-alikes
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
test("process_patch tolerates hyphen/dash variants", () => {
|
||||||
|
// The file contains EN DASH (\u2013) and NO-BREAK HYPHEN (\u2011)
|
||||||
|
const original =
|
||||||
|
"first\nimport foo # local import \u2013 avoids top\u2011level dep\nlast";
|
||||||
|
|
||||||
|
const patch = `*** Begin Patch\n*** Update File: uni.txt\n@@\n-import foo # local import - avoids top-level dep\n+import foo # HANDLED\n*** End Patch`;
|
||||||
|
|
||||||
|
const fs = createInMemoryFS({ "uni.txt": original });
|
||||||
|
process_patch(patch, fs.openFn, fs.writeFn, fs.removeFn);
|
||||||
|
|
||||||
|
expect(fs.files["uni.txt"]!.includes("HANDLED")).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
test.skip("process_patch tolerates smart quotes", () => {
|
||||||
|
const original = "console.log(\u201Chello\u201D);"; // “hello” with smart quotes
|
||||||
|
|
||||||
|
const patch = `*** Begin Patch\n*** Update File: quotes.js\n@@\n-console.log(\\"hello\\");\n+console.log(\\"HELLO\\");\n*** End Patch`;
|
||||||
|
|
||||||
|
const fs = createInMemoryFS({ "quotes.js": original });
|
||||||
|
process_patch(patch, fs.openFn, fs.writeFn, fs.removeFn);
|
||||||
|
|
||||||
|
expect(fs.files["quotes.js"]).toBe('console.log("HELLO");');
|
||||||
|
});
|
||||||
|
|
||||||
test("process_patch - add file", () => {
|
test("process_patch - add file", () => {
|
||||||
const patch = `*** Begin Patch
|
const patch = `*** Begin Patch
|
||||||
*** Add File: b.txt
|
*** Add File: b.txt
|
||||||
|
|||||||
Reference in New Issue
Block a user