fix: handling weird unicode characters in apply_patch (#674)
I ❤ unicode
This commit is contained in:
@@ -820,6 +820,51 @@ PATCH"#,
|
|||||||
assert_eq!(contents, "a\nB\nc\nd\nE\nf\ng\n");
|
assert_eq!(contents, "a\nB\nc\nd\nE\nf\ng\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Verify that a patch authored with plain ASCII characters can update a
/// line containing typographic Unicode punctuation (EN DASH, NON-BREAKING
/// HYPHEN). `git apply` historically tolerates this byte-level mismatch;
/// our matcher's fuzzy pass that normalises common punctuation should
/// bridge the gap the same way.
#[test]
fn test_update_line_with_unicode_dash() {
    let tmp = tempdir().unwrap();
    let target = tmp.path().join("unicode.py");

    // Seed the file with EN DASH (\u{2013}) and NON-BREAKING HYPHEN (\u{2011}).
    std::fs::write(
        &target,
        "import asyncio # local import \u{2013} avoids top\u{2011}level dep\n",
    )
    .unwrap();

    // The patch spells the same context line with an ASCII dash / hyphen.
    let patch = wrap_patch(&format!(
        r#"*** Update File: {}
@@
-import asyncio # local import - avoids top-level dep
+import asyncio # HELLO"#,
        target.display()
    ));

    let (mut stdout, mut stderr) = (Vec::new(), Vec::new());
    apply_patch(&patch, &mut stdout, &mut stderr).unwrap();

    // The comment must have been replaced in the file.
    assert_eq!(
        std::fs::read_to_string(&target).unwrap(),
        "import asyncio # HELLO\n"
    );

    // The success summary lists the file as modified…
    assert_eq!(
        String::from_utf8(stdout).unwrap(),
        format!(
            "Success. Updated the following files:\nM {}\n",
            target.display()
        )
    );

    // …and nothing is written to stderr.
    assert_eq!(String::from_utf8(stderr).unwrap(), "");
}
#[test]
|
#[test]
|
||||||
fn test_unified_diff() {
|
fn test_unified_diff() {
|
||||||
// Start with a file containing four lines.
|
// Start with a file containing four lines.
|
||||||
|
|||||||
@@ -63,6 +63,49 @@ pub(crate) fn seek_sequence(
|
|||||||
return Some(i);
|
return Some(i);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ------------------------------------------------------------------
|
||||||
|
// Final, most permissive pass – attempt to match after *normalising*
|
||||||
|
// common Unicode punctuation to their ASCII equivalents so that diffs
|
||||||
|
// authored with plain ASCII characters can still be applied to source
|
||||||
|
// files that contain typographic dashes / quotes, etc. This mirrors the
|
||||||
|
// fuzzy behaviour of `git apply` which ignores minor byte-level
|
||||||
|
// differences when locating context lines.
|
||||||
|
// ------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Trim the line and fold common typographic Unicode punctuation to its
/// ASCII equivalent (dashes, single/double quotes, exotic spaces) so that
/// ASCII-authored diffs can still match typographically-styled source
/// lines, mirroring the fuzzy behaviour of `git apply`.
fn normalise(s: &str) -> String {
    // Per-character ASCII folding; anything unrecognised passes through.
    fn ascii_fold(c: char) -> char {
        match c {
            // HYPHEN..HORIZONTAL BAR plus MINUS SIGN → ASCII '-'.
            '\u{2010}'..='\u{2015}' | '\u{2212}' => '-',
            // Curly / low-9 / reversed single quotes → '\''.
            '\u{2018}'..='\u{201B}' => '\'',
            // Curly / low-9 / reversed double quotes → '"'.
            '\u{201C}'..='\u{201F}' => '"',
            // NBSP, EN SPACE..HAIR SPACE, NNBSP, MMSP, IDEOGRAPHIC SPACE → ' '.
            '\u{00A0}' | '\u{2002}'..='\u{200A}' | '\u{202F}' | '\u{205F}' | '\u{3000}' => ' ',
            other => other,
        }
    }

    s.trim().chars().map(ascii_fold).collect()
}
|
||||||
|
|
||||||
|
for i in search_start..=lines.len().saturating_sub(pattern.len()) {
|
||||||
|
let mut ok = true;
|
||||||
|
for (p_idx, pat) in pattern.iter().enumerate() {
|
||||||
|
if normalise(&lines[i + p_idx]) != normalise(pat) {
|
||||||
|
ok = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ok {
|
||||||
|
return Some(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user