fix: handling weird unicode characters in apply_patch (#674)
I � unicode
This commit is contained in:
@@ -63,6 +63,49 @@ pub(crate) fn seek_sequence(
|
||||
return Some(i);
|
||||
}
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// Final, most permissive pass – attempt to match after *normalising*
|
||||
// common Unicode punctuation to their ASCII equivalents so that diffs
|
||||
// authored with plain ASCII characters can still be applied to source
|
||||
// files that contain typographic dashes / quotes, etc. This mirrors the
|
||||
// fuzzy behaviour of `git apply` which ignores minor byte-level
|
||||
// differences when locating context lines.
|
||||
// ------------------------------------------------------------------
|
||||
|
||||
fn normalise(s: &str) -> String {
|
||||
s.trim()
|
||||
.chars()
|
||||
.map(|c| match c {
|
||||
// Various dash / hyphen code-points → ASCII '-'
|
||||
'\u{2010}' | '\u{2011}' | '\u{2012}' | '\u{2013}' | '\u{2014}' | '\u{2015}'
|
||||
| '\u{2212}' => '-',
|
||||
// Fancy single quotes → '\''
|
||||
'\u{2018}' | '\u{2019}' | '\u{201A}' | '\u{201B}' => '\'',
|
||||
// Fancy double quotes → '"'
|
||||
'\u{201C}' | '\u{201D}' | '\u{201E}' | '\u{201F}' => '"',
|
||||
// Non-breaking space and other odd spaces → normal space
|
||||
'\u{00A0}' | '\u{2002}' | '\u{2003}' | '\u{2004}' | '\u{2005}' | '\u{2006}'
|
||||
| '\u{2007}' | '\u{2008}' | '\u{2009}' | '\u{200A}' | '\u{202F}' | '\u{205F}'
|
||||
| '\u{3000}' => ' ',
|
||||
other => other,
|
||||
})
|
||||
.collect::<String>()
|
||||
}
|
||||
|
||||
for i in search_start..=lines.len().saturating_sub(pattern.len()) {
|
||||
let mut ok = true;
|
||||
for (p_idx, pat) in pattern.iter().enumerate() {
|
||||
if normalise(&lines[i + p_idx]) != normalise(pat) {
|
||||
ok = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if ok {
|
||||
return Some(i);
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user