cadmus_core/dictionary/
mod.rs

//! A dict format (`*.dict`) reader module.
//!
//! This module can read dictionaries in the dict format, as used by dictd. It supports both
//! uncompressed and compressed dictionaries.
5
6mod dictreader;
7mod errors;
8mod indexing;
9
10use std::path::Path;
11
12use self::dictreader::DictReader;
13use self::indexing::IndexReader;
14
15/// A dictionary wrapper.
16///
17/// A dictionary is made up of a `*.dict` or `*.dict.dz` file with the actual content and a
18/// `*.index` file with a list of all headwords and with positions in the dict file + length
19/// information. It provides a convenience function to look up headwords directly, without caring
20/// about the details of the index and the underlying dict format.
21pub struct Dictionary {
22    content: Box<dyn DictReader>,
23    index: Box<dyn IndexReader>,
24    metadata: Metadata,
25}
26
/// The special metadata entries that we care about.
///
/// These entries should appear close to the beginning of the index file.
///
/// The derived `Default` has both flags cleared.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct Metadata {
    /// Whether headwords keep all of their characters. When `false`, queries are reduced to
    /// alphanumeric and whitespace characters before they are looked up.
    pub all_chars: bool,
    /// Whether headwords are matched case sensitively. When `false`, queries are lowercased
    /// before they are looked up.
    pub case_sensitive: bool,
}
34
35impl Dictionary {
36    /// Look up a word in a dictionary.
37    ///
38    /// Words are looked up in the index and then retrieved from the dict file. If no word was
39    /// found, the returned vector is empty. Errors result from the parsing of the underlying files.
40    pub fn lookup(
41        &mut self,
42        word: &str,
43        fuzzy: bool,
44    ) -> Result<Vec<[String; 2]>, errors::DictError> {
45        let mut query = word.to_string();
46        if !self.metadata.case_sensitive {
47            query = query.to_lowercase();
48        }
49        if !self.metadata.all_chars {
50            query = query
51                .chars()
52                .filter(|c| c.is_alphanumeric() || c.is_whitespace())
53                .collect();
54        }
55        let entries = self.index.load_and_find(&query, fuzzy, &self.metadata);
56        let mut results = Vec::new();
57        for entry in entries.into_iter() {
58            results.push([
59                entry.original.unwrap_or(entry.headword),
60                self.content.fetch_definition(entry.offset, entry.size)?,
61            ]);
62        }
63        Ok(results)
64    }
65
66    /// Retreive metadata from the dictionaries.
67    ///
68    /// The metadata headwords start with `00-database-` or `00database`.
69    pub fn metadata(&mut self, name: &str) -> Result<String, errors::DictError> {
70        let mut query = format!("00-database-{}", name);
71        if !self.metadata.all_chars {
72            query = query.replace(|c: char| !c.is_alphanumeric(), "");
73        }
74        let entries = self.index.find(&query, false);
75        let entry = entries
76            .get(0)
77            .ok_or_else(|| errors::DictError::WordNotFound(name.into()))?;
78        self.content
79            .fetch_definition(entry.offset, entry.size)
80            .map(|def| {
81                let start = def
82                    .find('\n')
83                    .filter(|pos| *pos < def.len() - 1)
84                    .unwrap_or(0);
85                def[start..].trim().to_string()
86            })
87    }
88
89    /// Get the short name.
90    ///
91    /// This returns the short name of a dictionary. This corresponds to the
92    /// value passed to the `-s` option of `dictfmt`.
93    pub fn short_name(&mut self) -> Result<String, errors::DictError> {
94        self.metadata("short")
95    }
96
97    /// Get the URL.
98    ///
99    /// This returns the URL of a dictionary. This corresponds to the
100    /// value passed to the `-u` option of `dictfmt`.
101    pub fn url(&mut self) -> Result<String, errors::DictError> {
102        self.metadata("url")
103    }
104}
105
106/// Load dictionary from given paths
107///
108/// A dictionary is made of an index and a dictionary (data) file, both are opened from the given
109/// input file names. Gzipped files with the suffix `.dz` will be handled automatically.
110pub fn load_dictionary_from_file<P: AsRef<Path>>(
111    content_path: P,
112    index_path: P,
113) -> Result<Dictionary, errors::DictError> {
114    let content = dictreader::load_dict(content_path)?;
115    let index = Box::new(indexing::parse_index_from_file(index_path, true)?);
116    Ok(load_dictionary(content, index))
117}
118
119/// Load dictionary from given `DictReader` and `Index`.
120///
121/// A dictionary is made of an index and a dictionary (data). Both are required for look up. This
122/// function allows abstraction from the underlying source by only requiring a
123/// `dictReader` as trait object. This way, dictionaries from RAM or similar can be
124/// implemented.
125pub fn load_dictionary(content: Box<dyn DictReader>, index: Box<dyn IndexReader>) -> Dictionary {
126    let all_chars = !index.find("00-database-allchars", false).is_empty();
127    let word = if all_chars {
128        "00-database-case-sensitive"
129    } else {
130        "00databasecasesensitive"
131    };
132    let case_sensitive = !index.find(word, false).is_empty();
133    Dictionary {
134        content,
135        index,
136        metadata: Metadata {
137            all_chars,
138            case_sensitive,
139        },
140    }
141}
142
#[cfg(test)]
mod tests {
    use super::*;

    const PATH_CASE_SENSITIVE_DICT: &str = "src/dictionary/testdata/case_sensitive_dict.dict";
    const PATH_CASE_SENSITIVE_INDEX: &str = "src/dictionary/testdata/case_sensitive_dict.index";
    const PATH_CASE_INSENSITIVE_DICT: &str = "src/dictionary/testdata/case_insensitive_dict.dict";
    const PATH_CASE_INSENSITIVE_INDEX: &str = "src/dictionary/testdata/case_insensitive_dict.index";

    /// Loads a test dictionary, failing the test with the underlying error if it cannot be read.
    fn load(content_path: &str, index_path: &str) -> Dictionary {
        load_dictionary_from_file(content_path, index_path)
            .expect("the test dictionary should load")
    }

    /// Asserts that an exact lookup of `headword` yields exactly one entry whose definition
    /// contains `definition`.
    fn assert_dict_word_exists(dict: &mut Dictionary, headword: &str, definition: &str) {
        // `expect` reports the actual error, unlike `assert!(r.is_ok())`.
        let search = dict
            .lookup(headword, false)
            .expect("the lookup should not fail");
        assert_eq!(search.len(), 1);
        assert!(search[0][1].contains(definition));
    }

    #[test]
    fn test_load_dictionary_from_file() {
        let _dict = load(PATH_CASE_INSENSITIVE_DICT, PATH_CASE_INSENSITIVE_INDEX);
    }

    #[test]
    fn test_dictionary_lookup_case_insensitive() {
        let mut dict = load(PATH_CASE_INSENSITIVE_DICT, PATH_CASE_INSENSITIVE_INDEX);

        assert_dict_word_exists(&mut dict, "bar", "test for case-sensitivity");
        assert_dict_word_exists(&mut dict, "Bar", "test for case-sensitivity");
        assert_dict_word_exists(&mut dict, "straße", "test for non-latin case-sensitivity");
    }

    #[test]
    fn test_dictionary_lookup_case_insensitive_fuzzy() {
        let mut dict = load(PATH_CASE_INSENSITIVE_DICT, PATH_CASE_INSENSITIVE_INDEX);

        let search = dict
            .lookup("ba", true)
            .expect("the fuzzy lookup should not fail");
        assert_eq!(search.len(), 1);
        assert_eq!(search[0][0], "bar");
        assert!(search[0][1].contains("test for case-sensitivity"));
    }

    #[test]
    fn test_dictionary_lookup_case_sensitive() {
        let mut dict = load(PATH_CASE_SENSITIVE_DICT, PATH_CASE_SENSITIVE_INDEX);

        assert_dict_word_exists(&mut dict, "Bar", "test for case-sensitivity");
        assert_dict_word_exists(&mut dict, "straße", "test for non-latin case-sensitivity");

        // Differently cased or transliterated spellings must not match.
        let lowercase = dict
            .lookup("bar", false)
            .expect("the lookup should not fail");
        assert!(lowercase.is_empty());

        let transliterated = dict
            .lookup("strasse", false)
            .expect("the lookup should not fail");
        assert!(transliterated.is_empty());
    }

    #[test]
    fn test_dictionary_lookup_case_sensitive_fuzzy() {
        let mut dict = load(PATH_CASE_SENSITIVE_DICT, PATH_CASE_SENSITIVE_INDEX);

        let search = dict
            .lookup("Ba", true)
            .expect("the fuzzy lookup should not fail");
        assert_eq!(search.len(), 1);
        assert_eq!(search[0][0], "Bar");
        assert!(search[0][1].contains("test for case-sensitivity"));
    }
}