Skip to main content

cadmus_core/dictionary/
indexing.rs

//! Shared types for dictionary index readers.
2
3use super::Metadata;
4
/// A single record of a dictionary index.
///
/// Derives `PartialEq`/`Eq` in addition to `Debug`/`Clone` so that entries
/// (and vectors of entries) can be compared directly, e.g. in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Entry {
    /// The word this entry is indexed under. After normalization this holds
    /// the transformed (filtered and/or lowercased) form.
    pub headword: String,
    /// Position of the entry's payload.
    // NOTE(review): presumably a byte offset into the dictionary data — confirm.
    pub offset: u64,
    /// Extent of the entry's payload.
    // NOTE(review): presumably a length in bytes — confirm.
    pub size: u64,
    /// The headword as it appeared before normalization, recorded only when
    /// normalization changed it; `None` otherwise.
    pub original: Option<String>,
}
12
13pub trait IndexReader {
14    fn load_and_find(&mut self, headword: &str, fuzzy: bool, metadata: &Metadata) -> Vec<Entry>;
15    fn find(&self, headword: &str, fuzzy: bool) -> Vec<Entry>;
16}
17
/// Normalizes a headword by optionally stripping characters and folding case.
///
/// The same function is meant to be applied to stored words at index time and
/// to the lookup term at query time, so both sides are compared in the same
/// form.
///
/// * `needs_char_filter` — when `true`, only characters that are alphanumeric
///   or whitespace are kept; everything else is dropped.
/// * `needs_lowercase` — when `true`, the result is lowercased.
///
/// Filtering, when requested, always happens before lowercasing. With both
/// flags `false` the headword is returned as an owned copy, unchanged.
pub(crate) fn apply_transform(
    headword: &str,
    needs_char_filter: bool,
    needs_lowercase: bool,
) -> String {
    match (needs_char_filter, needs_lowercase) {
        (false, false) => headword.to_owned(),
        (false, true) => headword.to_lowercase(),
        (true, fold_case) => {
            let kept: String = headword
                .chars()
                .filter(|ch| ch.is_alphanumeric() || ch.is_whitespace())
                .collect();
            if fold_case {
                kept.to_lowercase()
            } else {
                kept
            }
        }
    }
}
42
43fn normalize_internal(entries: &[Entry], metadata: &Metadata) -> Vec<Entry> {
44    let needs_char_filter = !metadata.all_chars;
45    let needs_lowercase = !metadata.case_sensitive;
46
47    if !needs_char_filter && !needs_lowercase && is_sorted(entries) {
48        return entries.to_vec();
49    }
50
51    let mut result: Vec<Entry> = entries
52        .iter()
53        .map(|entry| {
54            let transformed = apply_transform(&entry.headword, needs_char_filter, needs_lowercase);
55            let original = if transformed != entry.headword {
56                Some(entry.headword.clone())
57            } else {
58                None
59            };
60            Entry {
61                headword: transformed,
62                offset: entry.offset,
63                size: entry.size,
64                original,
65            }
66        })
67        .collect();
68
69    if is_sorted(&result) {
70        return result;
71    }
72
73    result.sort_by_cached_key(|e| e.headword.clone());
74    result
75}
76
77fn is_sorted(entries: &[Entry]) -> bool {
78    entries.windows(2).all(|w| w[0].headword <= w[1].headword)
79}
80
81/// Normalize entries based on dictionary metadata.
82///
83/// If no normalization is needed and the entries are already sorted, returns
84/// the original entries unchanged. Otherwise transforms headwords (lowercasing
85/// and/or stripping non-alphanumeric characters) and sorts by headword.
86#[cfg(feature = "bench")]
87pub fn normalize(entries: &[Entry], metadata: &Metadata) -> Vec<Entry> {
88    normalize_internal(entries, metadata)
89}
90
91#[cfg(not(feature = "bench"))]
92pub(crate) fn normalize(entries: &[Entry], metadata: &Metadata) -> Vec<Entry> {
93    normalize_internal(entries, metadata)
94}