cadmus_core/dictionary/dictreader.rs

//! Open and read .dict or .dict.dz files.
//!
//! This module contains traits and structs to work with uncompressed .dict and compressed .dict.dz
//! files. These files contain the actual dictionary content. While these readers return the
//! definitions, they do not do any post-processing. Definitions are normally plain text, but they
//! could be HTML, or anything else, in theory (although plain text is the de facto default).
//!
//! To understand some of the constants defined in this module or to understand the internals of
//! the DictReaderDz struct, it is advisable to have a brief look at
//! [the GZip standard](https://tools.ietf.org/html/rfc1952).
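//!
//! # Example
//!
//! A minimal usage sketch: the dictionary path and the offset/length values below are
//! placeholders; in practice the offset and length of a definition come from the accompanying
//! index file.
//!
//! ```ignore
//! // Transparently picks the dictzip reader because of the `.dz` extension.
//! let mut dict = load_dict("/usr/share/dictd/example.dict.dz")?;
//! // Offset and length of one definition, as listed in the corresponding index.
//! let definition = dict.fetch_definition(16_256, 213)?;
//! println!("{}", definition);
//! ```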
use std::ffi::OsStr;
use std::fs::File;
use std::io;
use std::io::{BufRead, BufReader, Read, Seek, SeekFrom};
use std::path::Path;

use super::errors::DictError;
use byteorder::*;

/// Limit the size of a word buffer, so that malicious index files cannot request too much memory
/// for a translation.
pub static MAX_BYTES_FOR_BUFFER: u64 = 1_048_576; // No headword definition is larger than 1M.

/// Byte mask to query for the existence of the FEXTRA field in the flags byte of a `.dz` file.
pub static GZ_FEXTRA: u8 = 0b0000_0100;
/// Byte mask to query for the existence of a file name in a `.dz` file.
pub static GZ_FNAME: u8 = 0b0000_1000; // Indicates whether a file name is contained in the archive.
/// Byte mask to query for the existence of a comment in a `.dz` file.
pub static GZ_COMMENT: u8 = 0b0001_0000; // Indicates whether a comment is present.
/// Byte mask to query for the existence of a header checksum (FHCRC) in a `.dz` file.
pub static GZ_FHCRC: u8 = 0b0000_0010;
/// A dictionary (content) reader.
///
/// This trait abstracts over the underlying seek operations required to look up
/// headwords and provides an easy method to fetch a definition, given a certain
/// offset and length. Users of a type which implements this trait don't need to
/// care about the compression of the dictionary.
pub trait DictReader {
    /// Fetch the definition from the dictionary at the given offset and length.
    fn fetch_definition(&mut self, start_offset: u64, length: u64) -> Result<String, DictError>;
}

/// Raw Dict reader.
///
/// This reader can read uncompressed .dict files.
pub struct DictReaderRaw<B: Read + Seek> {
    dict_data: B,
    total_length: u64,
}

impl<B: Read + Seek> DictReaderRaw<B> {
    /// Get a new DictReader from a Reader.
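    ///
    /// A minimal sketch using an in-memory cursor in place of an on-disk .dict file; the byte
    /// offsets are illustrative only.
    ///
    /// ```ignore
    /// use std::io::Cursor;
    ///
    /// let mut reader = DictReaderRaw::new(Cursor::new(b"hello world".to_vec()))?;
    /// assert_eq!(reader.fetch_definition(6, 5)?, "world");
    /// ```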
    pub fn new(mut dict_data: B) -> Result<DictReaderRaw<B>, DictError> {
        let end = dict_data.seek(SeekFrom::End(0))?;
        Ok(DictReaderRaw {
            dict_data,
            total_length: end,
        })
    }
}

impl<B: Read + Seek> DictReader for DictReaderRaw<B> {
    /// Fetch definition from dictionary.
    fn fetch_definition(&mut self, start_offset: u64, length: u64) -> Result<String, DictError> {
        if length > MAX_BYTES_FOR_BUFFER {
            return Err(DictError::MemoryError);
        }

        if (start_offset + length) > self.total_length {
            return Err(DictError::IoError(io::Error::new(
                io::ErrorKind::UnexpectedEof,
                "a seek beyond the end of uncompressed data was requested",
            )));
        }

        self.dict_data.seek(SeekFrom::Start(start_offset))?;
        let mut read_data = vec![0; length as usize];
        let bytes_read = self.dict_data.read(read_data.as_mut_slice())? as u64;
        if bytes_read != length {
            // Fewer bytes than requested: the read ran past the end of the file.
            return Err(DictError::IoError(io::Error::new(
                io::ErrorKind::UnexpectedEof,
                "seek beyond end of file",
            )));
        }
        Ok(String::from_utf8(read_data)?)
    }
}

/// Load a `DictReader` from a file.
///
/// This function loads a `DictReader` from a file and transparently selects
/// the correct reader based on the file extension, so the caller doesn't need to care about
/// compression (`.dz`).
///
/// # Errors
///
/// The function can return a `DictError`, which occurs either when an I/O error is encountered
/// or when the gzip-compressed file is invalid.
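///
/// # Example
///
/// A minimal sketch; the file names are placeholders and error handling is elided.
///
/// ```ignore
/// // The `.dz` extension selects the dictzip-aware reader ...
/// let mut compressed = load_dict("dictionary.dict.dz")?;
/// // ... any other extension selects the raw reader.
/// let mut plain = load_dict("dictionary.dict")?;
/// let definition = plain.fetch_definition(0, 32)?;
/// ```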
pub fn load_dict<P: AsRef<Path>>(path: P) -> Result<Box<dyn DictReader>, DictError> {
    if path.as_ref().extension() == Some(OsStr::new("dz")) {
        let reader = File::open(path)?;
        Ok(Box::new(DictReaderDz::new(reader)?))
    } else {
        let reader = BufReader::new(File::open(path)?);
        Ok(Box::new(DictReaderRaw::new(reader)?))
    }
}

/// Gzip Dict reader.
///
/// This reader can read compressed .dict files with the file name suffix .dz.
/// This format is documented in RFC 1952 and in `man dictzip`. An example implementation can be
/// found in the dict daemon (dictd) in `data.c`.
pub struct DictReaderDz<B: Read + Seek> {
    /// Compressed DZ dictionary.
    dzdict: B,
    /// Length of an uncompressed chunk.
    uchunk_length: usize,
    /// End of compressed data.
    end_compressed_data: usize,
    /// Offsets in the file at which a new compressed chunk starts.
    chunk_offsets: Vec<usize>,
    /// Total size of the uncompressed file.
    ufile_length: u64, // Stored as u64 so it can be compared to offsets without conversion.
}

/// A gzip chunk, representing the length and offset of a compressed chunk within the file.
#[derive(Debug)]
struct Chunk {
    offset: usize,
    length: usize,
}

impl<B: Read + Seek> DictReaderDz<B> {
    /// Get a new DictReader from a Reader.
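    ///
    /// A minimal sketch (`example.dict.dz` is a placeholder path and error handling is elided):
    ///
    /// ```ignore
    /// use std::fs::File;
    ///
    /// let file = File::open("example.dict.dz")?;
    /// let mut reader = DictReaderDz::new(file)?;
    /// let definition = reader.fetch_definition(0, 64)?;
    /// ```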
    pub fn new(dzdict: B) -> Result<DictReaderDz<B>, DictError> {
        let mut buffered_dzdict = BufReader::new(dzdict);
        let mut header = vec![0u8; 12];
        buffered_dzdict.read_exact(&mut header)?;
        if header[0..2] != [0x1F, 0x8B] {
            return Err(DictError::InvalidFileFormat(
                "Not in gzip format".into(),
                None,
            ));
        }

        let flags = header[3]; // Bitmap of gzip attributes (FLG).
        if (flags & GZ_FEXTRA) == 0 {
            // FLG.FEXTRA must be set for the dictzip format.
            return Err(DictError::InvalidFileFormat(
                "Extra flag (FLG.FEXTRA) not set, not in gzip + dictzip format".into(),
                None,
            ));
        }

        // Read XLEN, the length of the extra (FEXTRA) field.
        let xlen = LittleEndian::read_u16(&header[10..12]);

        // Read the FEXTRA data.
        let mut fextra = vec![0u8; xlen as usize];
        buffered_dzdict.read_exact(&mut fextra)?;

        if fextra[0..2] != [b'R', b'A'] {
            return Err(DictError::InvalidFileFormat(
                "No dictzip info found in FEXTRA header (behind XLEN, in the SI1/SI2 fields)".into(),
                None,
            ));
        }

        let length_subfield = LittleEndian::read_u16(&fextra[2..4]);
        assert_eq!(
            length_subfield,
            xlen - 4,
            "the length of the subfield should be the same as the FEXTRA field, ignoring the \
             additional length information and the file format identification"
        );
        let subf_version = LittleEndian::read_u16(&fextra[4..6]);
        if subf_version != 1 {
            return Err(DictError::InvalidFileFormat(
                "Unimplemented dictzip version, only version 1 is supported".into(),
                None,
            ));
        }

        // Before compression, the file is split into evenly-sized chunks and the size information
        // is put right after the version information:
        let uchunk_length = LittleEndian::read_u16(&fextra[6..8]);
        // Number of chunks in the file.
        let chunk_count = LittleEndian::read_u16(&fextra[8..10]);
        if chunk_count == 0 {
            return Err(DictError::InvalidFileFormat(
                "No compressed chunks in file or broken header information".into(),
                None,
            ));
        }

        // Compute the number of chunks which would fit into the FEXTRA field; used as a validity
        // check. The first 10 bytes of FEXTRA are header information, the rest is a list of
        // 2-byte, little-endian numbers (one per chunk).
        let chunks_which_would_fit = ((fextra.len() - 10) / 2) as u16;
        // Check that the number of claimed chunks matches the size given for the subfield.
        if chunks_which_would_fit != chunk_count {
            return Err(DictError::InvalidFileFormat(
                format!(
                    "Expected {} chunks according to the dictzip header, but the FEXTRA field can \
                     accommodate {}; possibly broken file",
                    chunk_count, chunks_which_would_fit
                ),
                None,
            ));
        }

        // If the file name bit is set, skip past the zero-terminated file name; we don't need it.
        if (flags & GZ_FNAME) != 0 {
            let mut tmp = Vec::new();
            buffered_dzdict.read_until(b'\0', &mut tmp)?;
        }

        // Skip past the zero-terminated comment, if any.
        if (flags & GZ_COMMENT) != 0 {
            let mut tmp = Vec::new();
            buffered_dzdict.read_until(b'\0', &mut tmp)?;
        }

        // Skip the header CRC (FHCRC), 2 bytes.
        if (flags & GZ_FHCRC) != 0 {
            buffered_dzdict.seek(SeekFrom::Current(2))?;
        }

        // Offsets in the file at which each compressed chunk starts.
        let mut chunk_offsets = Vec::with_capacity(chunk_count as usize);
        // Running position: starts at the first byte of compressed data and, after the loop below,
        // marks the end of the compressed data (this is NOT EOF; the gzip trailer follows).
        let mut end_compressed_data = buffered_dzdict.seek(SeekFrom::Current(0))? as usize;
        // After the header fields parsed above, the FEXTRA field holds the list of compressed
        // chunk lengths (taken as a slice for easier indexing).
        let chunks_from_header = &fextra[10..10 + 2 * chunk_count as usize];

        // Each chunk length is stored as a little-endian u16, so walk the list two bytes at a time.
        for index in (0..chunks_from_header.len()).step_by(2) {
            let compressed_len =
                LittleEndian::read_u16(&chunks_from_header[index..(index + 2)]) as usize;
            chunk_offsets.push(end_compressed_data);
            end_compressed_data += compressed_len;
        }
        assert_eq!(
            chunk_offsets.len() as u16,
            chunk_count,
            "the number of compressed chunks read from the .dz header must match the number of \
             chunks actually found in the file"
        );
        // Read the uncompressed file length. ISIZE is stored in the last four bytes of the gzip
        // file; the member trailer is a CRC32 followed by ISIZE (RFC 1952).
        buffered_dzdict.seek(SeekFrom::End(-4))?;
        let uncompressed = buffered_dzdict.read_u32::<LittleEndian>()?;

        Ok(DictReaderDz {
            dzdict: buffered_dzdict.into_inner(),
            chunk_offsets,
            end_compressed_data,
            uchunk_length: uchunk_length as usize,
            ufile_length: uncompressed as u64,
        })
    }
    /// Compute which compressed chunks cover the requested range of uncompressed bytes.
    fn get_chunks_for(&self, start_offset: u64, length: u64) -> Vec<Chunk> {
        let mut chunks = Vec::new();
        let start_chunk = start_offset as usize / self.uchunk_length;
        let end_chunk = (start_offset + length) as usize / self.uchunk_length;
        for id in start_chunk..=end_chunk {
            // The compressed length of a chunk is the distance to the next chunk offset, or to
            // the end of the compressed data for the last chunk.
            let chunk_length = match self.chunk_offsets.get(id + 1) {
                Some(next) => next - self.chunk_offsets[id],
                None => self.end_compressed_data - self.chunk_offsets[id],
            };
            chunks.push(Chunk {
                offset: self.chunk_offsets[id],
                length: chunk_length,
            });
        }

        chunks
    }

    /// Inflate (decompress) a single dictzip chunk of raw DEFLATE data.
    fn inflate(&self, data: Vec<u8>) -> Result<Vec<u8>, DictError> {
        let mut decoder = flate2::Decompress::new(false);
        let mut decoded = vec![0u8; self.uchunk_length];
        decoder.decompress(
            data.as_slice(),
            decoded.as_mut_slice(),
            flate2::FlushDecompress::None,
        )?;
        Ok(decoded)
    }
}

impl<B: Read + Seek> DictReader for DictReaderDz<B> {
    /// Fetch the definition from the dictionary.
    fn fetch_definition(&mut self, start_offset: u64, length: u64) -> Result<String, DictError> {
        if length > MAX_BYTES_FOR_BUFFER {
            return Err(DictError::MemoryError);
        }
        if (start_offset + length) > self.ufile_length {
            return Err(DictError::IoError(io::Error::new(
                io::ErrorKind::UnexpectedEof,
                "a seek beyond the end of uncompressed data was requested",
            )));
        }
        let mut data = Vec::new();
        for chunk in self.get_chunks_for(start_offset, length) {
            let pos = self.dzdict.seek(SeekFrom::Start(chunk.offset as u64))?;
            if pos != (chunk.offset as u64) {
                return Err(DictError::IoError(io::Error::new(
                    io::ErrorKind::Other,
                    format!(
                        "attempted to seek to {} but new position is {}",
                        chunk.offset, pos
                    ),
                )));
            }
            let mut definition = vec![0u8; chunk.length];
            self.dzdict.read_exact(&mut definition)?;
            data.push(self.inflate(definition)?);
        }

        // Cut the definition out of the inflated chunks and convert it to a string.
        let cut_front = start_offset as usize % self.uchunk_length;
        // Join the chunks into one vector, keeping only the content of the definition.
        let data = match data.len() {
            0 => unreachable!("get_chunks_for always returns at least one chunk"),
            1 => data[0][cut_front..cut_front + length as usize].to_vec(),
            n => {
                let mut tmp = data[0][cut_front..].to_vec();
                // The first chunk is already in tmp; copy every chunk in between verbatim and
                // handle the last chunk separately below.
                for text in data.iter().skip(1).take(n - 2) {
                    tmp.extend_from_slice(text);
                }
                // Append the last chunk, omitting anything past the end of the definition.
                let remaining_bytes = (length as usize + cut_front) % self.uchunk_length;
                tmp.extend_from_slice(&data[n - 1][..remaining_bytes]);
                tmp
            }
        };
        Ok(String::from_utf8(data)?)
    }
}
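// A minimal sketch of unit tests for the pieces above that need no real dictionary file: the raw
// reader is exercised through an in-memory cursor, and the chunk arithmetic of `get_chunks_for`
// is checked against hand-picked, illustrative offsets.
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    #[test]
    fn raw_reader_returns_the_requested_slice() {
        let mut reader = DictReaderRaw::new(Cursor::new(b"abcdefghij".to_vec())).unwrap();
        // Offset 2 with length 3 yields the third to fifth byte.
        assert_eq!(reader.fetch_definition(2, 3).unwrap(), "cde");
    }

    #[test]
    fn dz_reader_selects_the_chunks_covering_a_range() {
        // Only the chunk bookkeeping is exercised here, so the compressed payload stays empty.
        let reader = DictReaderDz {
            dzdict: Cursor::new(Vec::<u8>::new()),
            uchunk_length: 100,
            end_compressed_data: 300,
            chunk_offsets: vec![40, 120, 210],
            ufile_length: 300,
        };
        // A definition at uncompressed offset 150 with length 100 spans uncompressed chunks 1 and 2.
        let chunks = reader.get_chunks_for(150, 100);
        assert_eq!(chunks.len(), 2);
        assert_eq!((chunks[0].offset, chunks[0].length), (120, 90));
        assert_eq!((chunks[1].offset, chunks[1].length), (210, 90));
    }
}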