cadmus_core/dictionary/
indexing.rs

1//! Parse and decode `*.index` files.
2//!
3//! Each dictionary file (`*.dict.dz)`) is accompanied by a `*.index` file containing a list of
4//! words, together with its (byte) position in the dict file and its (byte) length. This module
5//! provides functions to parse this index file.
6//!
7//! The position and the length of a definition is given in a semi-base64 encoding. It uses all
8//! Latin letters (upper and lower case), all digits and additionally, `+` and `/`:
9//!
10//! `ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/`
11//!
12//! The calculation works as follows: `sum += x * 64^i`
13//!
14//! - `i` is the position within the string to calculate the number from and counts from right to
15//!   left, starting at 0.
16//! - `x` is the index within the array given above, i.e. `'a' == 26`.
17//!
18//! The sum makes up the index.
19
20use std::fs::File;
21use std::io::{BufRead, BufReader};
22use std::path::Path;
23
24use levenshtein::levenshtein;
25
26use super::errors::DictError;
27use super::errors::DictError::*;
28use super::Metadata;
29
30/// The index is partially loaded if `state` isn't `None`.
31pub struct Index<R: BufRead> {
32    pub entries: Vec<Entry>,
33    pub state: Option<R>,
34}
35
36#[derive(Debug, Clone)]
37pub struct Entry {
38    pub headword: String,
39    pub offset: u64,
40    pub size: u64,
41    pub original: Option<String>,
42}
43
44pub trait IndexReader {
45    fn load_and_find(&mut self, headword: &str, fuzzy: bool, metadata: &Metadata) -> Vec<Entry>;
46    fn find(&self, headword: &str, fuzzy: bool) -> Vec<Entry>;
47}
48
49fn normalize(entries: &[Entry], metadata: &Metadata) -> Vec<Entry> {
50    let mut result: Vec<Entry> = Vec::with_capacity(entries.len());
51
52    for entry in entries.iter() {
53        let mut headword = entry.headword.clone();
54
55        if !metadata.all_chars {
56            headword = headword
57                .chars()
58                .filter(|c| c.is_alphanumeric() || c.is_whitespace())
59                .collect();
60        }
61
62        if !metadata.case_sensitive {
63            headword = headword.to_lowercase();
64        }
65
66        let mut i = result.len();
67
68        while i > 0 && headword < result[i - 1].headword {
69            i -= 1;
70        }
71
72        let original = if headword != entry.headword {
73            Some(entry.headword.clone())
74        } else {
75            None
76        };
77
78        result.insert(
79            i,
80            Entry {
81                headword,
82                offset: entry.offset,
83                size: entry.size,
84                original,
85            },
86        );
87    }
88
89    result
90}
91
92impl<R: BufRead> IndexReader for Index<R> {
93    fn load_and_find(&mut self, headword: &str, fuzzy: bool, metadata: &Metadata) -> Vec<Entry> {
94        if let Some(br) = self.state.take() {
95            let has_dictfmt = self.entries.iter().any(|e| e.headword.contains("dictfmt"));
96            if let Ok(mut index) = parse_index(br, false) {
97                self.entries.append(&mut index.entries);
98                if !has_dictfmt {
99                    self.entries = normalize(&self.entries, metadata)
100                }
101            }
102        }
103        self.find(headword, fuzzy)
104    }
105
106    fn find(&self, headword: &str, fuzzy: bool) -> Vec<Entry> {
107        if fuzzy {
108            self.entries
109                .iter()
110                .filter(|entry| levenshtein(headword, &entry.headword) <= 1)
111                .cloned()
112                .collect()
113        } else {
114            if let Ok(mut i) = self
115                .entries
116                .binary_search_by_key(&headword, |entry| &entry.headword)
117            {
118                let mut results = vec![self.entries[i].clone()];
119                let j = i;
120                while i > 0 {
121                    i -= 1;
122                    if self.entries[i].headword != headword {
123                        break;
124                    }
125                    results.insert(0, self.entries[i].clone());
126                }
127                i = j;
128                while i < self.entries.len() - 1 {
129                    i += 1;
130                    if self.entries[i].headword != headword {
131                        break;
132                    }
133                    results.push(self.entries[i].clone());
134                }
135                results
136            } else {
137                Vec::new()
138            }
139        }
140    }
141}
142
143/// Get the assigned number for a character
144/// If the character was unknown, an empty Err(()) is returned.
145#[inline]
146fn get_base(input: char) -> Option<u64> {
147    match input {
148        'A'..='Z' => Some((input as u64) - 65), // 'A' should become 0
149        'a'..='z' => Some((input as u64) - 71), // 'a' should become 26, ...
150        '0'..='9' => Some((input as u64) + 4),  // 0 should become 52
151        '+' => Some(62),
152        '/' => Some(63),
153        _ => None,
154    }
155}
156
157/// Decode a number from a given String.
158///
159/// This function decodes a number from the format described in the module documentation. If
160/// unknown characters/bytes are encountered, a `DictError` is returned.
161pub fn decode_number(word: &str) -> Result<u64, DictError> {
162    let mut index = 0u64;
163    for (i, character) in word.chars().rev().enumerate() {
164        index += match get_base(character) {
165            Some(x) => x * 64u64.pow(i as u32),
166            None => return Err(InvalidCharacter(character, None, Some(i))),
167        };
168    }
169    Ok(index)
170}
171
172/// Parse a single line from the index file.
173fn parse_line(line: &str, line_number: usize) -> Result<(&str, u64, u64, Option<&str>), DictError> {
174    // First column: headword.
175    let mut split = line.split('\t');
176    let headword = split.next().ok_or(MissingColumnInIndex(line_number))?;
177
178    // Second column: offset into file.
179    let offset = split.next().ok_or(MissingColumnInIndex(line_number))?;
180    let offset = decode_number(offset)?;
181
182    // Third column: entry size.
183    let size = split.next().ok_or(MissingColumnInIndex(line_number))?;
184    let size = decode_number(size)?;
185
186    // Fourth column: optional original headword.
187    let original = split.next();
188
189    Ok((headword, offset, size, original))
190}
191
192/// Parse the index for a dictionary from a given BufRead compatible object.
193/// When `lazy` is `true`, the loop stops once all the metadata entries are parsed.
194pub fn parse_index<B: BufRead>(mut br: B, lazy: bool) -> Result<Index<B>, DictError> {
195    let mut info = false;
196    let mut entries = Vec::new();
197    let mut line_number = 0;
198    let mut line = String::new();
199
200    while let Ok(nb) = br.read_line(&mut line) {
201        if nb == 0 {
202            break;
203        }
204        let (headword, offset, size, original) = parse_line(line.trim_end(), line_number)?;
205
206        entries.push(Entry {
207            headword: headword.to_string(),
208            offset,
209            size,
210            original: original.map(String::from),
211        });
212
213        if lazy {
214            if !info && (headword.starts_with("00-database-") || headword.starts_with("00database"))
215            {
216                info = true;
217            } else if info
218                && !headword.starts_with("00-database-")
219                && !headword.starts_with("00database")
220            {
221                break;
222            }
223        }
224        line_number += 1;
225        line.clear();
226    }
227
228    let state = if lazy { Some(br) } else { None };
229
230    Ok(Index { entries, state })
231}
232
233/// Parse the index for a dictionary from a given path.
234pub fn parse_index_from_file<P: AsRef<Path>>(
235    path: P,
236    lazy: bool,
237) -> Result<Index<BufReader<File>>, DictError> {
238    let file = File::open(path)?;
239    let reader = BufReader::new(file);
240    parse_index(reader, lazy)
241}
242
243#[cfg(test)]
244mod tests {
245    use super::*;
246    use std::io::Empty;
247
248    const PATH_CASE_SENSITIVE_INDEX: &str = "src/dictionary/testdata/case_sensitive_dict.index";
249    const PATH_CASE_INSENSITIVE_INDEX: &str = "src/dictionary/testdata/case_insensitive_dict.index";
250
251    #[test]
252    fn test_index_find() {
253        let words = vec![
254            Entry {
255                headword: String::from("bar"),
256                offset: 0,
257                size: 8,
258                original: None,
259            },
260            Entry {
261                headword: String::from("baz"),
262                offset: 8,
263                size: 4,
264                original: None,
265            },
266            Entry {
267                headword: String::from("foo"),
268                offset: 12,
269                size: 4,
270                original: None,
271            },
272        ];
273
274        let index: Index<Empty> = Index {
275            entries: words,
276            state: None,
277        };
278
279        let r = index.find("apples", false);
280        assert!(r.is_empty());
281
282        let r = index.find("baz", false);
283        assert!(!r.is_empty());
284        assert_eq!(r.len(), 1);
285        assert_eq!(r.first().unwrap().headword, "baz");
286
287        let r = index.find("bas", true);
288        assert!(!r.is_empty());
289        assert_eq!(r.len(), 2);
290        assert_eq!(r.first().unwrap().headword, "bar");
291    }
292
293    #[test]
294    // Make sure that a lazy load does not inadvertently skip a word when it returns to BufRead
295    fn test_index_load_and_find() {
296        let r = parse_index_from_file(PATH_CASE_INSENSITIVE_INDEX, true);
297        assert!(r.is_ok());
298
299        let mut index = r.unwrap();
300        assert_eq!(index.entries[0].headword, "00-database-allchars");
301        assert_eq!(index.entries.last().unwrap().headword, "bar");
302
303        let r = index.load_and_find(
304            "bar",
305            false,
306            &Metadata {
307                all_chars: true,
308                case_sensitive: false,
309            },
310        );
311        assert!(!r.is_empty());
312
313        let r = index.load_and_find(
314            "foo",
315            false,
316            &Metadata {
317                all_chars: true,
318                case_sensitive: false,
319            },
320        );
321        assert!(!r.is_empty());
322    }
323
324    #[test]
325    fn test_parse_index_from_file() {
326        let r = parse_index_from_file(PATH_CASE_INSENSITIVE_INDEX, false);
327        assert!(r.is_ok());
328
329        let index = r.unwrap();
330        assert_eq!(index.entries[0].headword, "00-database-allchars");
331        assert_eq!(index.entries.last().unwrap().headword, "あいおい");
332    }
333
334    #[test]
335    fn test_parse_index_from_file_lazy() {
336        let r = parse_index_from_file(PATH_CASE_INSENSITIVE_INDEX, true);
337        assert!(r.is_ok());
338
339        let index = r.unwrap();
340        assert_eq!(index.entries[0].headword, "00-database-allchars");
341        assert_eq!(index.entries.last().unwrap().headword, "bar");
342    }
343
344    #[test]
345    fn test_parse_index_from_file_handles_case_insensitivity() {
346        let r = parse_index_from_file(PATH_CASE_INSENSITIVE_INDEX, false);
347        assert!(r.is_ok());
348
349        let index = r.unwrap();
350
351        let r = index.find("bar", false);
352        assert!(!r.is_empty());
353        assert_eq!(r.first().unwrap().headword, "bar");
354    }
355
356    #[test]
357    fn test_parse_index_from_file_handles_case_sensitivity() {
358        let r = parse_index_from_file(PATH_CASE_SENSITIVE_INDEX, false);
359        assert!(r.is_ok());
360
361        let index = r.unwrap();
362
363        let r = index.find("Bar", false);
364        assert!(!r.is_empty());
365        assert_eq!(r.first().unwrap().headword, "Bar");
366    }
367}