Skip to main content

cadmus_core/dictionary/
db_index.rs

1//! SQLite-backed dictionary index reader.
2//!
3//! Replaces the in-memory `.index` file reader with a database-backed implementation
4//! that supports both single-dictionary and cross-dictionary word lookups.
5
6use levenshtein::levenshtein;
7use sqlx::SqlitePool;
8
9use crate::db::runtime::RUNTIME;
10use crate::db::Database;
11
12use super::indexing::{Entry, IndexReader};
13use super::Metadata;
14
/// Makes `prefix` safe to embed literally in a SQLite `LIKE` pattern whose
/// `ESCAPE` character is `\`.
///
/// Each `\`, `%` and `_` in the input is preceded by a backslash; every other
/// character is copied through unchanged.
fn escape_like_prefix(prefix: &str) -> String {
    let mut escaped = String::with_capacity(prefix.len());
    for ch in prefix.chars() {
        if matches!(ch, '\\' | '%' | '_') {
            escaped.push('\\');
        }
        escaped.push(ch);
    }
    escaped
}
23
24/// SQLite-backed implementation of [`IndexReader`].
25///
26/// When `dict_id` is `Some`, queries are scoped to that dictionary.
27/// When `None`, queries search across all indexed dictionaries.
28pub struct DbIndexReader {
29    pool: SqlitePool,
30    dict_id: Option<i64>,
31}
32
33impl DbIndexReader {
34    /// Creates a new reader backed by `database`, optionally scoped to `dict_id`.
35    pub fn new(database: &Database, dict_id: Option<i64>) -> Self {
36        Self {
37            pool: database.pool().clone(),
38            dict_id,
39        }
40    }
41
42    #[cfg_attr(feature = "tracing", tracing::instrument(skip(self), fields(headword = %headword)))]
43    async fn exact_scoped(&self, headword: &str, id: i64) -> Vec<Entry> {
44        match sqlx::query!(
45            r#"SELECT word, offset, size, original
46               FROM dictionary_index_entry
47               WHERE dict_id = ? AND word = ?"#,
48            id,
49            headword,
50        )
51        .fetch_all(&self.pool)
52        .await
53        {
54            Ok(rows) => rows
55                .into_iter()
56                .map(|r| Entry {
57                    headword: r.word,
58                    offset: r.offset as u64,
59                    size: r.size as u64,
60                    original: r.original,
61                })
62                .collect(),
63            Err(e) => {
64                tracing::error!(error = %e, "exact scoped dictionary index query failed");
65                Vec::new()
66            }
67        }
68    }
69
70    #[cfg_attr(feature = "tracing", tracing::instrument(skip(self), fields(headword = %headword)))]
71    async fn exact_global(&self, headword: &str) -> Vec<Entry> {
72        match sqlx::query!(
73            r#"SELECT word, offset, size, original
74               FROM dictionary_index_entry
75               WHERE word = ?"#,
76            headword,
77        )
78        .fetch_all(&self.pool)
79        .await
80        {
81            Ok(rows) => rows
82                .into_iter()
83                .map(|r| Entry {
84                    headword: r.word,
85                    offset: r.offset as u64,
86                    size: r.size as u64,
87                    original: r.original,
88                })
89                .collect(),
90            Err(e) => {
91                tracing::error!(error = %e, "exact global dictionary index query failed");
92                Vec::new()
93            }
94        }
95    }
96
97    #[cfg_attr(feature = "tracing", tracing::instrument(skip(self), fields(headword = %headword, prefix = %prefix)))]
98    async fn fuzzy_scoped(&self, headword: &str, prefix: &str, id: i64) -> Vec<Entry> {
99        match sqlx::query!(
100            r#"SELECT word, offset, size, original
101               FROM dictionary_index_entry
102               WHERE dict_id = ? AND word LIKE ? || '%' ESCAPE '\'"#,
103            id,
104            prefix,
105        )
106        .fetch_all(&self.pool)
107        .await
108        {
109            Ok(rows) => rows
110                .into_iter()
111                .filter(|r| levenshtein(headword, &r.word) <= 1)
112                .map(|r| Entry {
113                    headword: r.word,
114                    offset: r.offset as u64,
115                    size: r.size as u64,
116                    original: r.original,
117                })
118                .collect(),
119            Err(e) => {
120                tracing::error!(error = %e, "fuzzy scoped dictionary index query failed");
121                Vec::new()
122            }
123        }
124    }
125
126    #[cfg_attr(feature = "tracing", tracing::instrument(skip(self), fields(headword = %headword, prefix = %prefix)))]
127    async fn fuzzy_global(&self, headword: &str, prefix: &str) -> Vec<Entry> {
128        match sqlx::query!(
129            r#"SELECT word, offset, size, original
130               FROM dictionary_index_entry
131               WHERE word LIKE ? || '%' ESCAPE '\'"#,
132            prefix,
133        )
134        .fetch_all(&self.pool)
135        .await
136        {
137            Ok(rows) => rows
138                .into_iter()
139                .filter(|r| levenshtein(headword, &r.word) <= 1)
140                .map(|r| Entry {
141                    headword: r.word,
142                    offset: r.offset as u64,
143                    size: r.size as u64,
144                    original: r.original,
145                })
146                .collect(),
147            Err(e) => {
148                tracing::error!(error = %e, "fuzzy global dictionary index query failed");
149                Vec::new()
150            }
151        }
152    }
153
154    #[cfg_attr(feature = "tracing", tracing::instrument(skip(self), fields(headword = %headword )))]
155    fn query_exact(&self, headword: &str) -> Vec<Entry> {
156        let headword = headword.to_string();
157
158        RUNTIME.block_on(async {
159            if let Some(id) = self.dict_id {
160                self.exact_scoped(&headword, id).await
161            } else {
162                self.exact_global(&headword).await
163            }
164        })
165    }
166
167    #[cfg_attr(feature = "tracing", tracing::instrument(skip(self), fields(headword = %headword )))]
168    fn query_fuzzy(&self, headword: &str) -> Vec<Entry> {
169        let prefix_len = headword
170            .char_indices()
171            .nth(3)
172            .map(|(i, _)| i)
173            .unwrap_or(headword.len());
174        let prefix = escape_like_prefix(&headword[..prefix_len]);
175        let headword = headword.to_string();
176
177        RUNTIME.block_on(async {
178            if let Some(id) = self.dict_id {
179                self.fuzzy_scoped(&headword, &prefix, id).await
180            } else {
181                self.fuzzy_global(&headword, &prefix).await
182            }
183        })
184    }
185}
186
187impl IndexReader for DbIndexReader {
188    #[cfg_attr(feature = "tracing", tracing::instrument(skip(self, _metadata), fields(headword = %headword, fuzzy)))]
189    fn load_and_find(&mut self, headword: &str, fuzzy: bool, _metadata: &Metadata) -> Vec<Entry> {
190        self.find(headword, fuzzy)
191    }
192
193    #[cfg_attr(feature = "tracing", tracing::instrument(skip(self), fields(headword = %headword, fuzzy)))]
194    fn find(&self, headword: &str, fuzzy: bool) -> Vec<Entry> {
195        if fuzzy {
196            self.query_fuzzy(headword)
197        } else {
198            self.query_exact(headword)
199        }
200    }
201}
202
#[cfg(test)]
mod tests {
    use super::*;
    use crate::db::runtime::RUNTIME;

    const DICT_ID_1: i64 = 1;
    const DICT_ID_2: i64 = 2;

    /// Opens a fresh in-memory database and applies the migrations.
    fn setup_db() -> Database {
        let database = Database::new(":memory:").expect("in-memory db");
        database.migrate().expect("migrations");
        database
    }

    /// Inserts the metadata row for `dict_id` unless one already exists.
    ///
    /// `fp` is stored as both the fingerprint and the dictionary path.
    fn insert_meta(pool: &SqlitePool, dict_id: i64, fp: &str) {
        let outcome = RUNTIME.block_on(async {
            sqlx::query!(
                "INSERT OR IGNORE INTO dictionary_index_meta (dict_id, fingerprint, dict_path, total_lines, indexed_lines, completed) VALUES (?, ?, ?, 0, 0, 1)",
                dict_id,
                fp,
                fp,
            )
            .execute(pool)
            .await
        });
        outcome.expect("insert meta");
    }

    /// Inserts one index entry for `dict_id`, after making sure the
    /// dictionary's metadata row exists.
    fn insert_entry(
        pool: &SqlitePool,
        dict_id: i64,
        fp: &str,
        word: &str,
        offset: i64,
        size: i64,
        original: Option<&str>,
    ) {
        insert_meta(pool, dict_id, fp);
        let outcome = RUNTIME.block_on(async {
            sqlx::query!(
                "INSERT INTO dictionary_index_entry (dict_id, word, offset, size, original) VALUES (?, ?, ?, ?, ?)",
                dict_id,
                word,
                offset,
                size,
                original,
            )
            .execute(pool)
            .await
        });
        outcome.expect("insert entry");
    }

    /// Collects the headwords of `entries` in their original order.
    fn headwords(entries: &[Entry]) -> Vec<&str> {
        entries.iter().map(|entry| entry.headword.as_str()).collect()
    }

    #[test]
    fn test_exact_lookup_with_dict_id() {
        let db = setup_db();
        insert_entry(db.pool(), DICT_ID_1, "fp1", "hello", 0, 10, None);
        insert_entry(db.pool(), DICT_ID_2, "fp2", "world", 10, 5, None);

        let found = DbIndexReader::new(&db, Some(DICT_ID_1)).find("hello", false);

        assert_eq!(found.len(), 1);
        let entry = &found[0];
        assert_eq!(entry.headword, "hello");
        assert_eq!(entry.offset, 0);
        assert_eq!(entry.size, 10);
    }

    #[test]
    fn test_exact_lookup_scoped_dict_id_excludes_other() {
        let db = setup_db();
        insert_entry(db.pool(), DICT_ID_1, "fp1", "hello", 0, 10, None);
        insert_entry(db.pool(), DICT_ID_2, "fp2", "hello", 20, 8, None);

        let found = DbIndexReader::new(&db, Some(DICT_ID_1)).find("hello", false);

        assert_eq!(found.len(), 1);
        assert_eq!(found[0].offset, 0);
    }

    #[test]
    fn test_exact_lookup_no_dict_id_finds_all() {
        let db = setup_db();
        insert_entry(db.pool(), DICT_ID_1, "fp1", "hello", 0, 10, None);
        insert_entry(db.pool(), DICT_ID_2, "fp2", "hello", 20, 8, None);

        let found = DbIndexReader::new(&db, None).find("hello", false);

        assert_eq!(found.len(), 2);
    }

    #[test]
    fn test_exact_lookup_no_match() {
        let db = setup_db();
        insert_entry(db.pool(), DICT_ID_1, "fp1", "hello", 0, 10, None);

        let found = DbIndexReader::new(&db, Some(DICT_ID_1)).find("world", false);

        assert!(found.is_empty());
    }

    #[test]
    fn test_fuzzy_lookup_with_dict_id() {
        let db = setup_db();
        insert_entry(db.pool(), DICT_ID_1, "fp1", "hello", 0, 10, None);
        insert_entry(db.pool(), DICT_ID_1, "fp1", "helo", 10, 5, None);
        insert_entry(db.pool(), DICT_ID_1, "fp1", "world", 15, 5, None);

        let found = DbIndexReader::new(&db, Some(DICT_ID_1)).find("hello", true);

        assert_eq!(found.len(), 2);
        let words = headwords(&found);
        assert!(words.contains(&"hello"));
        assert!(words.contains(&"helo"));
    }

    #[test]
    fn test_fuzzy_lookup_no_dict_id_cross_dict() {
        let db = setup_db();
        insert_entry(db.pool(), DICT_ID_1, "fp1", "hello", 0, 10, None);
        insert_entry(db.pool(), DICT_ID_2, "fp2", "helo", 10, 5, None);

        let found = DbIndexReader::new(&db, None).find("hello", true);

        assert_eq!(found.len(), 2);
    }

    #[test]
    fn test_load_and_find_delegates_to_find() {
        let db = setup_db();
        insert_entry(db.pool(), DICT_ID_1, "fp1", "hello", 0, 10, None);

        let metadata = Metadata {
            all_chars: true,
            case_sensitive: false,
        };
        let mut index = DbIndexReader::new(&db, Some(DICT_ID_1));
        let found = index.load_and_find("hello", false, &metadata);

        assert_eq!(found.len(), 1);
        assert_eq!(found[0].headword, "hello");
    }

    #[test]
    fn test_original_field_preserved() {
        let db = setup_db();
        insert_entry(db.pool(), DICT_ID_1, "fp1", "hello", 0, 10, Some("Hello"));

        let found = DbIndexReader::new(&db, Some(DICT_ID_1)).find("hello", false);

        assert_eq!(found.len(), 1);
        assert_eq!(found[0].original.as_deref(), Some("Hello"));
    }

    #[test]
    fn test_multiple_definitions_same_word_all_returned() {
        let db = setup_db();
        insert_entry(db.pool(), DICT_ID_1, "fp1", "pain", 100, 20, Some("Pain"));
        insert_entry(db.pool(), DICT_ID_1, "fp1", "pain", 200, 30, Some("PAIN"));
        insert_entry(db.pool(), DICT_ID_1, "fp1", "pain", 300, 40, None);

        let found = DbIndexReader::new(&db, Some(DICT_ID_1)).find("pain", false);

        assert_eq!(found.len(), 3);
        let offsets: Vec<u64> = found.iter().map(|entry| entry.offset).collect();
        for expected in [100, 200, 300] {
            assert!(offsets.contains(&expected));
        }
    }
}
370}