Files
llmx/codex-rs/apply-patch/src/seek_sequence.rs

152 lines
5.5 KiB
Rust
Raw Normal View History

/// Attempt to find the sequence of `pattern` lines within `lines` beginning at or after `start`.
/// Returns the starting index of the match or `None` if not found. Matches are attempted with
/// decreasing strictness: exact match, then ignoring trailing whitespace, then ignoring leading
/// and trailing whitespace. When `eof` is true, we first try starting at the end-of-file (so that
/// patterns intended to match file endings are applied at the end), and fall back to searching
/// from `start` if needed.
///
/// Special cases handled defensively:
/// • Empty `pattern` → returns `Some(start)` (no-op match)
/// • `pattern.len() > lines.len()` → returns `None` (cannot match, avoids
/// outofbounds panic that occurred pre20250412)
pub(crate) fn seek_sequence(
lines: &[String],
pattern: &[String],
start: usize,
eof: bool,
) -> Option<usize> {
if pattern.is_empty() {
return Some(start);
}
// When the pattern is longer than the available input there is no possible
// match. Earlyreturn to avoid the outofbounds slice that would occur in
// the search loops below (previously caused a panic when
// `pattern.len() > lines.len()`).
if pattern.len() > lines.len() {
return None;
}
let search_start = if eof && lines.len() >= pattern.len() {
lines.len() - pattern.len()
} else {
start
};
// Exact match first.
for i in search_start..=lines.len().saturating_sub(pattern.len()) {
if lines[i..i + pattern.len()] == *pattern {
return Some(i);
}
}
// Then rstrip match.
for i in search_start..=lines.len().saturating_sub(pattern.len()) {
let mut ok = true;
for (p_idx, pat) in pattern.iter().enumerate() {
if lines[i + p_idx].trim_end() != pat.trim_end() {
ok = false;
break;
}
}
if ok {
return Some(i);
}
}
// Finally, trim both sides to allow more lenience.
for i in search_start..=lines.len().saturating_sub(pattern.len()) {
let mut ok = true;
for (p_idx, pat) in pattern.iter().enumerate() {
if lines[i + p_idx].trim() != pat.trim() {
ok = false;
break;
}
}
if ok {
return Some(i);
}
}
// ------------------------------------------------------------------
// Final, most permissive pass attempt to match after *normalising*
// common Unicode punctuation to their ASCII equivalents so that diffs
// authored with plain ASCII characters can still be applied to source
// files that contain typographic dashes / quotes, etc. This mirrors the
// fuzzy behaviour of `git apply` which ignores minor byte-level
// differences when locating context lines.
// ------------------------------------------------------------------
fn normalise(s: &str) -> String {
s.trim()
.chars()
.map(|c| match c {
// Various dash / hyphen code-points → ASCII '-'
'\u{2010}' | '\u{2011}' | '\u{2012}' | '\u{2013}' | '\u{2014}' | '\u{2015}'
| '\u{2212}' => '-',
// Fancy single quotes → '\''
'\u{2018}' | '\u{2019}' | '\u{201A}' | '\u{201B}' => '\'',
// Fancy double quotes → '"'
'\u{201C}' | '\u{201D}' | '\u{201E}' | '\u{201F}' => '"',
// Non-breaking space and other odd spaces → normal space
'\u{00A0}' | '\u{2002}' | '\u{2003}' | '\u{2004}' | '\u{2005}' | '\u{2006}'
| '\u{2007}' | '\u{2008}' | '\u{2009}' | '\u{200A}' | '\u{202F}' | '\u{205F}'
| '\u{3000}' => ' ',
other => other,
})
.collect::<String>()
}
for i in search_start..=lines.len().saturating_sub(pattern.len()) {
let mut ok = true;
for (p_idx, pat) in pattern.iter().enumerate() {
if normalise(&lines[i + p_idx]) != normalise(pat) {
ok = false;
break;
}
}
if ok {
return Some(i);
}
}
None
}
#[cfg(test)]
mod tests {
use super::seek_sequence;
use std::string::ToString;
fn to_vec(strings: &[&str]) -> Vec<String> {
strings.iter().map(ToString::to_string).collect()
}
#[test]
fn test_exact_match_finds_sequence() {
let lines = to_vec(&["foo", "bar", "baz"]);
let pattern = to_vec(&["bar", "baz"]);
assert_eq!(seek_sequence(&lines, &pattern, 0, false), Some(1));
}
#[test]
fn test_rstrip_match_ignores_trailing_whitespace() {
let lines = to_vec(&["foo ", "bar\t\t"]);
// Pattern omits trailing whitespace.
let pattern = to_vec(&["foo", "bar"]);
assert_eq!(seek_sequence(&lines, &pattern, 0, false), Some(0));
}
#[test]
fn test_trim_match_ignores_leading_and_trailing_whitespace() {
let lines = to_vec(&[" foo ", " bar\t"]);
// Pattern omits any additional whitespace.
let pattern = to_vec(&["foo", "bar"]);
assert_eq!(seek_sequence(&lines, &pattern, 0, false), Some(0));
}
#[test]
fn test_pattern_longer_than_input_returns_none() {
let lines = to_vec(&["just one line"]);
let pattern = to_vec(&["too", "many", "lines"]);
// Should not panic must return None when pattern cannot possibly fit.
assert_eq!(seek_sequence(&lines, &pattern, 0, false), None);
}
}