Skip to main content

cadmus_core/dictionary/
mod.rs

1//! A dict format (`*.dict`) reader crate.
2//!
3//! This crate can read dictionaries in the dict format, as used by dictd. It supports both
4//! uncompressed and compressed dictionaries.
5//!
6//! It also provides support for downloading dictionaries from the monolingual project.
7
8mod dictreader;
9mod errors;
10#[cfg(feature = "bench")]
11pub mod indexing;
12#[cfg(not(feature = "bench"))]
13mod indexing;
14
15pub(crate) mod db_index;
16mod monolingual;
17
18pub(crate) use monolingual::MonolingualDictionaryService;
19
20use std::path::Path;
21
22use self::dictreader::DictReader;
23use self::indexing::IndexReader;
24pub(crate) use self::indexing::{apply_transform, normalize, Entry};
25use crate::db::Database;
26use crate::helpers::Fp;
27
/// A dictionary wrapper.
///
/// A dictionary is made up of a `*.dict` or `*.dict.dz` file with the actual content and a
/// `*.index` file with a list of all headwords and with positions in the dict file + length
/// information. It provides a convenience function to look up headwords directly, without caring
/// about the details of the index and the underlying dict format.
///
/// The index does not have to be a file: `load_dictionary_from_db` builds a dictionary whose
/// index lookups are served from the database.
pub struct Dictionary {
    /// Reader used to fetch definition text by offset and size.
    content: Box<dyn DictReader>,
    /// Headword index that is queried to locate entries inside `content`.
    index: Box<dyn IndexReader>,
    /// Normalization flags detected from the index when the dictionary was loaded.
    metadata: Metadata,
}
39
/// The special metadata entries that we care about.
///
/// These entries should appear close to the beginning of the index file.
///
/// Both flags are derived from the presence of marker headwords in the index; they control how
/// a query is transformed before it is compared against the indexed headwords.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Metadata {
    /// `true` when the index contains the `00-database-allchars` marker entry.
    pub all_chars: bool,
    /// `true` when the index contains the case-sensitivity marker entry
    /// (`00-database-case-sensitive`, or `00databasecasesensitive` when `all_chars` is unset).
    pub case_sensitive: bool,
}
47
48impl Dictionary {
49    /// Look up a word in a dictionary.
50    ///
51    /// Words are looked up in the index and then retrieved from the dict file. If no word was
52    /// found, the returned vector is empty. Errors result from the parsing of the underlying files.
53    ///
54    /// Normalization (lowercasing, char filtering) is applied at index time, so the query word
55    /// must be normalized the same way before calling this method.
56    #[cfg_attr(feature = "tracing", tracing::instrument(skip(self), fields(word = %word, fuzzy)))]
57    pub fn lookup(
58        &mut self,
59        word: &str,
60        fuzzy: bool,
61    ) -> Result<Vec<[String; 2]>, errors::DictError> {
62        let query = apply_transform(
63            word,
64            !self.metadata.all_chars,
65            !self.metadata.case_sensitive,
66        );
67        let entries = self.index.load_and_find(&query, fuzzy, &self.metadata);
68        let mut results = Vec::new();
69        for entry in entries.into_iter() {
70            results.push([
71                entry.original.unwrap_or(entry.headword),
72                self.content.fetch_definition(entry.offset, entry.size)?,
73            ]);
74        }
75        Ok(results)
76    }
77
78    /// Retreive metadata from the dictionaries.
79    ///
80    /// The metadata headwords start with `00-database-` or `00database`.
81    #[cfg_attr(feature = "tracing", tracing::instrument(skip(self), fields(name = %name)))]
82    pub fn metadata(&mut self, name: &str) -> Result<String, errors::DictError> {
83        let mut query = format!("00-database-{}", name);
84        if !self.metadata.all_chars {
85            query = query.replace(|c: char| !c.is_alphanumeric(), "");
86        }
87        let entries = self.index.find(&query, false);
88        let entry = entries
89            .get(0)
90            .ok_or_else(|| errors::DictError::WordNotFound(name.into()))?;
91        self.content
92            .fetch_definition(entry.offset, entry.size)
93            .map(|def| {
94                let start = def
95                    .find('\n')
96                    .filter(|pos| *pos < def.len() - 1)
97                    .unwrap_or(0);
98                def[start..].trim().to_string()
99            })
100    }
101
102    /// Get the short name.
103    ///
104    /// This returns the short name of a dictionary. This corresponds to the
105    /// value passed to the `-s` option of `dictfmt`.
106    #[cfg_attr(feature = "tracing", tracing::instrument(skip(self)))]
107    pub fn short_name(&mut self) -> Result<String, errors::DictError> {
108        self.metadata("short")
109    }
110
111    /// Get the URL.
112    ///
113    /// This returns the URL of a dictionary. This corresponds to the
114    /// value passed to the `-u` option of `dictfmt`.
115    #[cfg_attr(feature = "tracing", tracing::instrument(skip(self)))]
116    pub fn url(&mut self) -> Result<String, errors::DictError> {
117        self.metadata("url")
118    }
119}
120
121/// Resolves the `dict_id` for a given fingerprint from the database.
122///
123/// Returns `None` if the fingerprint is not found in `dictionary_index_meta`.
124#[cfg_attr(feature = "tracing", tracing::instrument(skip(database), fields(fingerprint = %fingerprint)))]
125pub fn resolve_dict_id(database: &Database, fingerprint: &Fp) -> Option<i64> {
126    let fp_str = fingerprint.to_string();
127    let pool = database.pool().clone();
128
129    crate::db::runtime::RUNTIME
130        .block_on(async {
131            sqlx::query_scalar!(
132                "SELECT dict_id FROM dictionary_index_meta WHERE fingerprint = ?",
133                fp_str
134            )
135            .fetch_optional(&pool)
136            .await
137            .ok()
138            .flatten()
139        })
140        .flatten()
141}
142
143/// Load dictionary using a database-backed index reader.
144///
145/// The content file is read from disk; index lookups are served from the
146/// database. Returns an error if the dictionary has not yet been registered
147/// in `dictionary_index_meta` — this happens when the indexing task has not
148/// run yet for this file. The caller should skip the dictionary in that case
149/// and retry after indexing completes.
150#[cfg_attr(feature = "tracing", tracing::instrument(skip(database), fields(fingerprint = %fingerprint)))]
151pub fn load_dictionary_from_db<P: AsRef<Path> + std::fmt::Debug>(
152    content_path: P,
153    database: &Database,
154    fingerprint: Fp,
155) -> Result<Dictionary, errors::DictError> {
156    let dict_id = match resolve_dict_id(database, &fingerprint) {
157        Some(id) => id,
158        None => {
159            tracing::warn!(fingerprint = %fingerprint, "dictionary not yet indexed, skipping");
160            return Err(errors::DictError::InvalidFileFormat(
161                "dictionary not yet indexed".into(),
162                None,
163            ));
164        }
165    };
166    let content = dictreader::load_dict(content_path)?;
167    let index = Box::new(db_index::DbIndexReader::new(database, Some(dict_id)));
168    Ok(load_dictionary(content, index))
169}
170
171/// Load dictionary from given `DictReader` and `Index`.
172///
173/// A dictionary is made of an index and a dictionary (data). Both are required for look up. This
174/// function allows abstraction from the underlying source by only requiring a
175/// `DictReader` and an [`IndexReader`].
176#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
177pub fn load_dictionary(content: Box<dyn DictReader>, index: Box<dyn IndexReader>) -> Dictionary {
178    let all_chars = !index.find("00-database-allchars", false).is_empty();
179    let word = if all_chars {
180        "00-database-case-sensitive"
181    } else {
182        "00databasecasesensitive"
183    };
184    let case_sensitive = !index.find(word, false).is_empty();
185    Dictionary {
186        content,
187        index,
188        metadata: Metadata {
189            all_chars,
190            case_sensitive,
191        },
192    }
193}
194
#[cfg(test)]
mod tests {
    use super::*;
    use crate::db::runtime::RUNTIME;

    // Fixture content files read by the dictionaries under test.
    const PATH_CASE_SENSITIVE_DICT: &str = "src/dictionary/testdata/case_sensitive_dict.dict";
    const PATH_CASE_INSENSITIVE_DICT: &str = "src/dictionary/testdata/case_insensitive_dict.dict";

    /// One index row fixture: `(headword, offset, size, original)`.
    type TestEntry = (&'static str, i64, i64, Option<&'static str>);

    const CASE_INSENSITIVE_ENTRIES: &[TestEntry] = &[
        ("00-database-allchars", 1, 1, None),
        ("bar", 443, 30, None),
        ("foo", 428, 15, None),
        ("straße", 516, 44, None),
    ];

    const CASE_SENSITIVE_ENTRIES: &[TestEntry] = &[
        ("00-database-allchars", 1, 1, None),
        ("00-database-case-sensitive", 2, 1, None),
        ("Bar", 459, 30, None),
        ("foo", 444, 15, None),
        ("straße", 532, 44, None),
    ];

    /// Seeds a fresh in-memory database with one metadata row and the given index entries,
    /// then loads the dictionary through `load_dictionary_from_db`.
    ///
    /// Headwords are stored in their transformed form; the untransformed spelling is kept in
    /// the `original` column whenever the transformation changed the word.
    fn load_test_dictionary(
        content_path: &str,
        entries: &[TestEntry],
        case_sensitive: bool,
        all_chars: bool,
    ) -> Result<Dictionary, errors::DictError> {
        let database = Database::new(":memory:").expect("in-memory db");
        database.migrate().expect("migrations");

        let fingerprint = Fp::from_u64(1);
        let fingerprint_text = fingerprint.to_string();

        RUNTIME.block_on(async {
            sqlx::query!(
                r#"INSERT INTO dictionary_index_meta (fingerprint, dict_path, total_lines, indexed_lines, completed)
                   VALUES (?, ?, ?, 0, 0)"#,
                fingerprint_text,
                content_path,
                0_i64,
            )
            .execute(database.pool())
            .await
            .expect("insert meta");

            for (word, offset, size, original) in entries {
                let normalized = apply_transform(word, !all_chars, !case_sensitive);
                // An explicit fixture value wins over the derived one.
                let stored_original = original.or(if normalized != *word {
                    Some(*word)
                } else {
                    None
                });

                sqlx::query!(
                    r#"INSERT OR IGNORE INTO dictionary_index_entry (dict_id, word, offset, size, original)
                       VALUES (?, ?, ?, ?, ?)"#,
                    1_i64,
                    normalized,
                    offset,
                    size,
                    stored_original,
                )
                .execute(database.pool())
                .await
                .expect("insert entry");
            }
        });

        load_dictionary_from_db(content_path, &database, fingerprint)
    }

    /// Loads the case-insensitive fixture (`allchars` set, no case-sensitivity marker).
    fn load_case_insensitive() -> Result<Dictionary, errors::DictError> {
        load_test_dictionary(
            PATH_CASE_INSENSITIVE_DICT,
            CASE_INSENSITIVE_ENTRIES,
            false,
            true,
        )
    }

    /// Loads the case-sensitive fixture (`allchars` and case-sensitivity markers both set).
    fn load_case_sensitive() -> Result<Dictionary, errors::DictError> {
        load_test_dictionary(PATH_CASE_SENSITIVE_DICT, CASE_SENSITIVE_ENTRIES, true, true)
    }

    /// Runs a lookup that is expected to succeed and hands back its matches.
    fn lookup_ok(dict: &mut Dictionary, word: &str, fuzzy: bool) -> Vec<[String; 2]> {
        let outcome = dict.lookup(word, fuzzy);
        assert!(outcome.is_ok());
        outcome.unwrap()
    }

    /// Asserts that an exact lookup yields a single match whose definition contains
    /// `definition`.
    fn assert_dict_word_exists(dict: &mut Dictionary, headword: &str, definition: &str) {
        let matches = lookup_ok(dict, headword, false);
        assert_eq!(matches.len(), 1);
        assert!(matches[0][1].contains(definition));
    }

    #[test]
    fn test_load_dictionary_from_db() {
        assert!(load_case_insensitive().is_ok());
    }

    #[test]
    fn test_dictionary_lookup_case_insensitive() {
        let mut dict = load_case_insensitive().unwrap();

        assert_dict_word_exists(&mut dict, "bar", "test for case-sensitivity");
        assert_dict_word_exists(&mut dict, "Bar", "test for case-sensitivity");
        assert_dict_word_exists(&mut dict, "straße", "test for non-latin case-sensitivity");
    }

    #[test]
    fn test_dictionary_lookup_case_insensitive_fuzzy() {
        let mut dict = load_case_insensitive().unwrap();

        let matches = lookup_ok(&mut dict, "ba", true);
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0][0], "bar");
        assert!(matches[0][1].contains("test for case-sensitivity"));
    }

    #[test]
    fn test_dictionary_lookup_case_sensitive() {
        let mut dict = load_case_sensitive().unwrap();

        assert_dict_word_exists(&mut dict, "Bar", "test for case-sensitivity");
        assert_dict_word_exists(&mut dict, "straße", "test for non-latin case-sensitivity");

        // Neither a differently-cased nor a transliterated query may match.
        assert!(lookup_ok(&mut dict, "bar", false).is_empty());
        assert!(lookup_ok(&mut dict, "strasse", false).is_empty());
    }

    #[test]
    fn test_dictionary_lookup_case_sensitive_fuzzy() {
        let mut dict = load_case_sensitive().unwrap();

        let matches = lookup_ok(&mut dict, "Ba", true);
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0][0], "Bar");
        assert!(matches[0][1].contains("test for case-sensitivity"));
    }
}
355}