1use std::fs::File;
21use std::io::{BufRead, BufReader};
22use std::path::Path;
23
24use levenshtein::levenshtein;
25
26use super::errors::DictError;
27use super::errors::DictError::*;
28use super::Metadata;
29
30pub struct Index<R: BufRead> {
32 pub entries: Vec<Entry>,
33 pub state: Option<R>,
34}
35
36#[derive(Debug, Clone)]
37pub struct Entry {
38 pub headword: String,
39 pub offset: u64,
40 pub size: u64,
41 pub original: Option<String>,
42}
43
44pub trait IndexReader {
45 fn load_and_find(&mut self, headword: &str, fuzzy: bool, metadata: &Metadata) -> Vec<Entry>;
46 fn find(&self, headword: &str, fuzzy: bool) -> Vec<Entry>;
47}
48
49fn normalize(entries: &[Entry], metadata: &Metadata) -> Vec<Entry> {
50 let mut result: Vec<Entry> = Vec::with_capacity(entries.len());
51
52 for entry in entries.iter() {
53 let mut headword = entry.headword.clone();
54
55 if !metadata.all_chars {
56 headword = headword
57 .chars()
58 .filter(|c| c.is_alphanumeric() || c.is_whitespace())
59 .collect();
60 }
61
62 if !metadata.case_sensitive {
63 headword = headword.to_lowercase();
64 }
65
66 let mut i = result.len();
67
68 while i > 0 && headword < result[i - 1].headword {
69 i -= 1;
70 }
71
72 let original = if headword != entry.headword {
73 Some(entry.headword.clone())
74 } else {
75 None
76 };
77
78 result.insert(
79 i,
80 Entry {
81 headword,
82 offset: entry.offset,
83 size: entry.size,
84 original,
85 },
86 );
87 }
88
89 result
90}
91
92impl<R: BufRead> IndexReader for Index<R> {
93 fn load_and_find(&mut self, headword: &str, fuzzy: bool, metadata: &Metadata) -> Vec<Entry> {
94 if let Some(br) = self.state.take() {
95 let has_dictfmt = self.entries.iter().any(|e| e.headword.contains("dictfmt"));
96 if let Ok(mut index) = parse_index(br, false) {
97 self.entries.append(&mut index.entries);
98 if !has_dictfmt {
99 self.entries = normalize(&self.entries, metadata)
100 }
101 }
102 }
103 self.find(headword, fuzzy)
104 }
105
106 fn find(&self, headword: &str, fuzzy: bool) -> Vec<Entry> {
107 if fuzzy {
108 self.entries
109 .iter()
110 .filter(|entry| levenshtein(headword, &entry.headword) <= 1)
111 .cloned()
112 .collect()
113 } else {
114 if let Ok(mut i) = self
115 .entries
116 .binary_search_by_key(&headword, |entry| &entry.headword)
117 {
118 let mut results = vec![self.entries[i].clone()];
119 let j = i;
120 while i > 0 {
121 i -= 1;
122 if self.entries[i].headword != headword {
123 break;
124 }
125 results.insert(0, self.entries[i].clone());
126 }
127 i = j;
128 while i < self.entries.len() - 1 {
129 i += 1;
130 if self.entries[i].headword != headword {
131 break;
132 }
133 results.push(self.entries[i].clone());
134 }
135 results
136 } else {
137 Vec::new()
138 }
139 }
140 }
141}
142
143#[inline]
146fn get_base(input: char) -> Option<u64> {
147 match input {
148 'A'..='Z' => Some((input as u64) - 65), 'a'..='z' => Some((input as u64) - 71), '0'..='9' => Some((input as u64) + 4), '+' => Some(62),
152 '/' => Some(63),
153 _ => None,
154 }
155}
156
157pub fn decode_number(word: &str) -> Result<u64, DictError> {
162 let mut index = 0u64;
163 for (i, character) in word.chars().rev().enumerate() {
164 index += match get_base(character) {
165 Some(x) => x * 64u64.pow(i as u32),
166 None => return Err(InvalidCharacter(character, None, Some(i))),
167 };
168 }
169 Ok(index)
170}
171
172fn parse_line(line: &str, line_number: usize) -> Result<(&str, u64, u64, Option<&str>), DictError> {
174 let mut split = line.split('\t');
176 let headword = split.next().ok_or(MissingColumnInIndex(line_number))?;
177
178 let offset = split.next().ok_or(MissingColumnInIndex(line_number))?;
180 let offset = decode_number(offset)?;
181
182 let size = split.next().ok_or(MissingColumnInIndex(line_number))?;
184 let size = decode_number(size)?;
185
186 let original = split.next();
188
189 Ok((headword, offset, size, original))
190}
191
192pub fn parse_index<B: BufRead>(mut br: B, lazy: bool) -> Result<Index<B>, DictError> {
195 let mut info = false;
196 let mut entries = Vec::new();
197 let mut line_number = 0;
198 let mut line = String::new();
199
200 while let Ok(nb) = br.read_line(&mut line) {
201 if nb == 0 {
202 break;
203 }
204 let (headword, offset, size, original) = parse_line(line.trim_end(), line_number)?;
205
206 entries.push(Entry {
207 headword: headword.to_string(),
208 offset,
209 size,
210 original: original.map(String::from),
211 });
212
213 if lazy {
214 if !info && (headword.starts_with("00-database-") || headword.starts_with("00database"))
215 {
216 info = true;
217 } else if info
218 && !headword.starts_with("00-database-")
219 && !headword.starts_with("00database")
220 {
221 break;
222 }
223 }
224 line_number += 1;
225 line.clear();
226 }
227
228 let state = if lazy { Some(br) } else { None };
229
230 Ok(Index { entries, state })
231}
232
233pub fn parse_index_from_file<P: AsRef<Path>>(
235 path: P,
236 lazy: bool,
237) -> Result<Index<BufReader<File>>, DictError> {
238 let file = File::open(path)?;
239 let reader = BufReader::new(file);
240 parse_index(reader, lazy)
241}
242
243#[cfg(test)]
244mod tests {
245 use super::*;
246 use std::io::Empty;
247
248 const PATH_CASE_SENSITIVE_INDEX: &str = "src/dictionary/testdata/case_sensitive_dict.index";
249 const PATH_CASE_INSENSITIVE_INDEX: &str = "src/dictionary/testdata/case_insensitive_dict.index";
250
251 #[test]
252 fn test_index_find() {
253 let words = vec![
254 Entry {
255 headword: String::from("bar"),
256 offset: 0,
257 size: 8,
258 original: None,
259 },
260 Entry {
261 headword: String::from("baz"),
262 offset: 8,
263 size: 4,
264 original: None,
265 },
266 Entry {
267 headword: String::from("foo"),
268 offset: 12,
269 size: 4,
270 original: None,
271 },
272 ];
273
274 let index: Index<Empty> = Index {
275 entries: words,
276 state: None,
277 };
278
279 let r = index.find("apples", false);
280 assert!(r.is_empty());
281
282 let r = index.find("baz", false);
283 assert!(!r.is_empty());
284 assert_eq!(r.len(), 1);
285 assert_eq!(r.first().unwrap().headword, "baz");
286
287 let r = index.find("bas", true);
288 assert!(!r.is_empty());
289 assert_eq!(r.len(), 2);
290 assert_eq!(r.first().unwrap().headword, "bar");
291 }
292
293 #[test]
294 fn test_index_load_and_find() {
296 let r = parse_index_from_file(PATH_CASE_INSENSITIVE_INDEX, true);
297 assert!(r.is_ok());
298
299 let mut index = r.unwrap();
300 assert_eq!(index.entries[0].headword, "00-database-allchars");
301 assert_eq!(index.entries.last().unwrap().headword, "bar");
302
303 let r = index.load_and_find(
304 "bar",
305 false,
306 &Metadata {
307 all_chars: true,
308 case_sensitive: false,
309 },
310 );
311 assert!(!r.is_empty());
312
313 let r = index.load_and_find(
314 "foo",
315 false,
316 &Metadata {
317 all_chars: true,
318 case_sensitive: false,
319 },
320 );
321 assert!(!r.is_empty());
322 }
323
324 #[test]
325 fn test_parse_index_from_file() {
326 let r = parse_index_from_file(PATH_CASE_INSENSITIVE_INDEX, false);
327 assert!(r.is_ok());
328
329 let index = r.unwrap();
330 assert_eq!(index.entries[0].headword, "00-database-allchars");
331 assert_eq!(index.entries.last().unwrap().headword, "あいおい");
332 }
333
334 #[test]
335 fn test_parse_index_from_file_lazy() {
336 let r = parse_index_from_file(PATH_CASE_INSENSITIVE_INDEX, true);
337 assert!(r.is_ok());
338
339 let index = r.unwrap();
340 assert_eq!(index.entries[0].headword, "00-database-allchars");
341 assert_eq!(index.entries.last().unwrap().headword, "bar");
342 }
343
344 #[test]
345 fn test_parse_index_from_file_handles_case_insensitivity() {
346 let r = parse_index_from_file(PATH_CASE_INSENSITIVE_INDEX, false);
347 assert!(r.is_ok());
348
349 let index = r.unwrap();
350
351 let r = index.find("bar", false);
352 assert!(!r.is_empty());
353 assert_eq!(r.first().unwrap().headword, "bar");
354 }
355
356 #[test]
357 fn test_parse_index_from_file_handles_case_sensitivity() {
358 let r = parse_index_from_file(PATH_CASE_SENSITIVE_INDEX, false);
359 assert!(r.is_ok());
360
361 let index = r.unwrap();
362
363 let r = index.find("Bar", false);
364 assert!(!r.is_empty());
365 assert_eq!(r.first().unwrap().headword, "Bar");
366 }
367}