Files
llmx/codex-rs/apply-patch/src/seek_sequence.rs
2025-04-25 16:01:58 -07:00

151 lines
5.5 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/// Attempt to find the sequence of `pattern` lines within `lines`.
///
/// Returns the index of the first line of the match, or `None` if the pattern
/// cannot be located.
///
/// Matching is attempted with decreasing strictness; the first pass that
/// finds a match wins:
///
/// 1. exact line equality,
/// 2. equality after stripping trailing whitespace,
/// 3. equality after stripping leading and trailing whitespace,
/// 4. equality after trimming *and* folding common typographic Unicode
///    punctuation (dashes, curly quotes, exotic spaces) to ASCII.
///
/// When `eof` is `false` the search covers candidate positions at or after
/// `start`. When `eof` is `true` the pattern is first tried at the very end of
/// `lines` (so that hunks intended for the end of the file land there, even if
/// that position precedes `start`); only if every pass fails there does the
/// search fall back to scanning forward from `start`.
///
/// Special cases handled defensively:
/// - Empty `pattern` returns `Some(start)` (no-op match).
/// - `pattern.len() > lines.len()` returns `None` (the pattern cannot fit;
///   this also avoids an out-of-bounds slice in the search).
pub(crate) fn seek_sequence(
    lines: &[String],
    pattern: &[String],
    start: usize,
    eof: bool,
) -> Option<usize> {
    if pattern.is_empty() {
        return Some(start);
    }
    // A pattern longer than the input can never match. Returning early also
    // guarantees the subtraction below cannot underflow.
    if pattern.len() > lines.len() {
        return None;
    }

    if eof {
        // Prefer anchoring the pattern to the last `pattern.len()` lines.
        let tail = lines.len() - pattern.len();
        if let Some(idx) = seek_from(lines, pattern, tail) {
            return Some(idx);
        }
        // Nothing matched at the end of the file: fall through to the regular
        // forward search so the pattern can still be located elsewhere.
    }
    seek_from(lines, pattern, start)
}

/// Runs every matching pass, strictest first, over the windows of `lines`
/// that begin at index `from` or later.
fn seek_from(lines: &[String], pattern: &[String], from: usize) -> Option<usize> {
    find_window(lines, pattern, from, |a, b| a == b)
        .or_else(|| find_window(lines, pattern, from, |a, b| a.trim_end() == b.trim_end()))
        .or_else(|| find_window(lines, pattern, from, |a, b| a.trim() == b.trim()))
        // Most permissive pass: diffs authored with plain ASCII punctuation
        // can still be applied to files containing typographic characters.
        .or_else(|| {
            find_window(lines, pattern, from, |a, b| {
                normalised_chars(a).eq(normalised_chars(b))
            })
        })
}

/// Returns the first index `i >= from` such that every line of
/// `lines[i..i + pattern.len()]` is considered equal to the corresponding
/// pattern line by `same`.
fn find_window(
    lines: &[String],
    pattern: &[String],
    from: usize,
    same: impl Fn(&str, &str) -> bool,
) -> Option<usize> {
    // Last index at which the pattern still fits; `None` if it never fits.
    let last = lines.len().checked_sub(pattern.len())?;
    // If `from > last` the range is empty and no window is examined.
    (from..=last).find(|&i| {
        lines[i..i + pattern.len()]
            .iter()
            .zip(pattern)
            .all(|(line, pat)| same(line.as_str(), pat.as_str()))
    })
}

/// Yields the characters of `s` after trimming surrounding whitespace, with
/// typographic punctuation folded to ASCII. Lazy, so comparing two lines does
/// not allocate.
fn normalised_chars(s: &str) -> impl Iterator<Item = char> + '_ {
    s.trim().chars().map(normalise_char)
}

/// Maps common Unicode punctuation to its ASCII equivalent; any other
/// character is returned unchanged.
fn normalise_char(c: char) -> char {
    match c {
        // Various dash / hyphen code points -> ASCII '-'
        '\u{2010}' | '\u{2011}' | '\u{2012}' | '\u{2013}' | '\u{2014}' | '\u{2015}'
        | '\u{2212}' => '-',
        // Fancy single quotes -> '\''
        '\u{2018}' | '\u{2019}' | '\u{201A}' | '\u{201B}' => '\'',
        // Fancy double quotes -> '"'
        '\u{201C}' | '\u{201D}' | '\u{201E}' | '\u{201F}' => '"',
        // Non-breaking space and other unusual spaces -> normal space
        '\u{00A0}' | '\u{2002}' | '\u{2003}' | '\u{2004}' | '\u{2005}' | '\u{2006}'
        | '\u{2007}' | '\u{2008}' | '\u{2009}' | '\u{200A}' | '\u{202F}' | '\u{205F}'
        | '\u{3000}' => ' ',
        other => other,
    }
}
#[cfg(test)]
mod tests {
    use super::seek_sequence;

    /// Builds an owned `Vec<String>` from string literals so the tests can
    /// describe file contents and patterns concisely.
    fn to_vec(strings: &[&str]) -> Vec<String> {
        strings.iter().copied().map(String::from).collect()
    }

    /// A pattern that appears verbatim is located by the exact-match pass.
    #[test]
    fn test_exact_match_finds_sequence() {
        let haystack = to_vec(&["foo", "bar", "baz"]);
        let needle = to_vec(&["bar", "baz"]);
        let found = seek_sequence(&haystack, &needle, 0, false);
        assert_eq!(found, Some(1));
    }

    /// Trailing spaces / tabs in the input must not prevent a match.
    #[test]
    fn test_rstrip_match_ignores_trailing_whitespace() {
        let haystack = to_vec(&["foo ", "bar\t\t"]);
        // The pattern carries no trailing whitespace at all.
        let needle = to_vec(&["foo", "bar"]);
        let found = seek_sequence(&haystack, &needle, 0, false);
        assert_eq!(found, Some(0));
    }

    /// Whitespace on either side of the input lines must not prevent a match.
    #[test]
    fn test_trim_match_ignores_leading_and_trailing_whitespace() {
        let haystack = to_vec(&[" foo ", " bar\t"]);
        // The pattern carries no surrounding whitespace at all.
        let needle = to_vec(&["foo", "bar"]);
        let found = seek_sequence(&haystack, &needle, 0, false);
        assert_eq!(found, Some(0));
    }

    /// A pattern with more lines than the input cannot fit.
    #[test]
    fn test_pattern_longer_than_input_returns_none() {
        let haystack = to_vec(&["just one line"]);
        let needle = to_vec(&["too", "many", "lines"]);
        // Must not panic; the only valid answer is `None`.
        let found = seek_sequence(&haystack, &needle, 0, false);
        assert_eq!(found, None);
    }
}