178 lines
6.1 KiB
Rust
178 lines
6.1 KiB
Rust
/// Simple case-insensitive subsequence matcher used for fuzzy filtering.
///
/// Returns the indices (character positions) of the matched characters in the
/// ORIGINAL `haystack` string and a score where smaller is better, or `None`
/// when `needle` is not a subsequence of `haystack`. An empty `needle` always
/// matches, with no indices and the worst possible score (`i32::MAX`).
///
/// Matching is greedy: every needle character is paired with the leftmost
/// haystack character that follows the previous hit.
///
/// Unicode correctness: both strings are lowercased one character at a time
/// with `char::to_lowercase`, and a mapping is kept from each character of the
/// lowercased haystack back to the original character index in `haystack`.
/// This ensures the returned indices can be safely used with
/// `str::chars().enumerate()` consumers for highlighting, even when
/// lowercasing expands a character into several (e.g., İ → i̇).
///
/// The needle is deliberately NOT lowercased with `str::to_lowercase`: that
/// method is context-sensitive (a word-final 'Σ' becomes 'ς' instead of 'σ'),
/// so it would disagree with the per-character lowering of the haystack and
/// make a string fail to match itself.
///
/// This is lowercasing, not full case folding: 'ß' stays 'ß', so "straße"
/// does not match the needle "strasse".
///
/// Score: the number of lowercased characters inside the matched window that
/// are not part of the needle, minus 100 when the first matched character is
/// the first character of `haystack` (prefix bonus).
pub fn fuzzy_match(haystack: &str, needle: &str) -> Option<(Vec<usize>, i32)> {
    if needle.is_empty() {
        return Some((Vec::new(), i32::MAX));
    }

    // Lowercase the haystack char by char, remembering which original
    // character each lowercased character came from.
    let mut lowered_chars: Vec<char> = Vec::new();
    let mut lowered_to_orig_char_idx: Vec<usize> = Vec::new();
    for (orig_idx, ch) in haystack.chars().enumerate() {
        for lc in ch.to_lowercase() {
            lowered_chars.push(lc);
            lowered_to_orig_char_idx.push(orig_idx);
        }
    }

    // Same per-character lowering as the haystack so both sides always agree.
    let lowered_needle: Vec<char> = needle.chars().flat_map(char::to_lowercase).collect();

    let mut result_orig_indices: Vec<usize> = Vec::with_capacity(lowered_needle.len());
    let mut last_lower_pos: Option<usize> = None;
    // One shared cursor over the haystack: each `find` resumes right after
    // the previous hit, which is what makes this a subsequence match.
    let mut candidates = lowered_chars.iter().enumerate();
    for &nc in &lowered_needle {
        let (pos, _) = candidates.find(|&(_, &hc)| hc == nc)?;
        result_orig_indices.push(lowered_to_orig_char_idx[pos]);
        last_lower_pos = Some(pos);
    }

    // The window starts at the first lowercased character belonging to the
    // first matched ORIGINAL character, so a hit anywhere inside the expansion
    // of the haystack's first character still counts as a prefix match.
    let first_lower_pos = result_orig_indices
        .first()
        .and_then(|&first_orig| {
            lowered_to_orig_char_idx
                .iter()
                .position(|&oi| oi == first_orig)
        })
        .unwrap_or(0);
    let last_lower_pos = last_lower_pos.unwrap_or(first_lower_pos);

    // Extra characters spanned by the window beyond the needle itself (>= 0).
    let span = last_lower_pos.saturating_sub(first_lower_pos) + 1;
    let gaps = span.saturating_sub(lowered_needle.len());
    // Clamp before narrowing so absurdly long inputs cannot wrap around.
    let mut score = gaps.min(i32::MAX as usize) as i32;
    // Strongly reward prefix matches.
    if first_lower_pos == 0 {
        score -= 100;
    }

    // Hits are found left to right, so the indices are already in ascending
    // order; only duplicates (several needle chars landing inside one expanded
    // original character) need to be removed.
    result_orig_indices.dedup();
    Some((result_orig_indices, score))
}
|
|
|
|
/// Convenience wrapper to get only the indices for a fuzzy match.
|
|
pub fn fuzzy_indices(haystack: &str, needle: &str) -> Option<Vec<usize>> {
|
|
fuzzy_match(haystack, needle).map(|(mut idx, _)| {
|
|
idx.sort_unstable();
|
|
idx.dedup();
|
|
idx
|
|
})
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `fuzzy_match` and fails the calling test when nothing matches.
    #[track_caller]
    fn must_match(haystack: &str, needle: &str) -> (Vec<usize>, i32) {
        match fuzzy_match(haystack, needle) {
            Some(hit) => hit,
            None => panic!("expected a match"),
        }
    }

    #[test]
    fn ascii_basic_indices() {
        let (idx, score) = must_match("hello", "hl");
        assert_eq!(idx, vec![0, 2]);
        // Hits at 0 and 2 leave one unmatched char in the window; prefix bonus: 1 - 100.
        assert_eq!(score, -99);
    }

    #[test]
    fn unicode_dotted_i_istanbul_highlighting() {
        let (idx, score) = must_match("İstanbul", "is");
        // Indices refer to the original string, not to its lowercased expansion.
        assert_eq!(idx, vec![0, 1]);
        // Lowered hits at 0 and 2 leave one unmatched char in the window; prefix bonus applies.
        assert_eq!(score, -99);
    }

    #[test]
    fn unicode_german_sharp_s_casefold() {
        // No "ß" -> "ss" folding is performed, so this needle is not a subsequence.
        assert!(fuzzy_match("straße", "strasse").is_none());
    }

    #[test]
    fn prefer_contiguous_match_over_spread() {
        let (_, score_a) = must_match("abc", "abc");
        let (_, score_b) = must_match("a-b-c", "abc");
        // Contiguous prefix: no gaps, bonus only.
        assert_eq!(score_a, -100);
        // Three needle chars spread over five haystack chars: two gaps, plus bonus.
        assert_eq!(score_b, -98);
        assert!(score_a < score_b);
    }

    #[test]
    fn start_of_string_bonus_applies() {
        let (_, score_a) = must_match("file_name", "file");
        let (_, score_b) = must_match("my_file_name", "file");
        // Contiguous match at the very start: bonus only.
        assert_eq!(score_a, -100);
        // Contiguous match further in: no gaps and no bonus.
        assert_eq!(score_b, 0);
        assert!(score_a < score_b);
    }

    #[test]
    fn empty_needle_matches_with_max_score_and_no_indices() {
        if let Some((idx, score)) = fuzzy_match("anything", "") {
            assert!(idx.is_empty());
            assert_eq!(score, i32::MAX);
        } else {
            panic!("empty needle should match");
        }
    }

    #[test]
    fn case_insensitive_matching_basic() {
        let (idx, score) = must_match("FooBar", "foO");
        assert_eq!(idx, vec![0, 1, 2]);
        // Case differences are ignored: contiguous prefix match, bonus only.
        assert_eq!(score, -100);
    }

    #[test]
    fn indices_are_deduped_for_multichar_lowercase_expansion() {
        // "i" followed by a combining dot above: the lowercase form of 'İ'.
        let needle = "\u{0069}\u{0307}";
        let (idx, score) = must_match("İ", needle);
        // Both needle chars land inside the same original character.
        assert_eq!(idx, vec![0]);
        // The two lowered chars are adjacent and start the string: bonus only.
        assert_eq!(score, -100);
    }
}
|