Skip to main content

cadmus_core/dictionary/
dictreader.rs

1//! Open and read .dict or .dict.dz files.
2//!
3//! This module contains traits and structs to work with uncompressed .dict and compressed .dict.dz
4//! files. These files contain the actual dictionary content. While these readers return the
5//! definitions, they do not do any post-processing. Definitions are normally plain text, but they
6//! could be HTML, or anything else, in theory (although plain text is the de facto default).
7//!
8//! To understand some of the constants defined in this module or to understand the internals of
9//! the DictReaderDz struct, it is advisable to have a brief look at
10//! [the GZip standard](https://tools.ietf.org/html/rfc1952).
11
12use std::ffi::OsStr;
13use std::fs::File;
14use std::io;
15use std::io::{BufRead, BufReader, Read, Seek, SeekFrom};
16use std::path::Path;
17
18use super::errors::DictError;
19use byteorder::*;
20
/// Upper bound, in bytes, for the buffer holding a single definition, so that malicious index
/// files cannot request too much memory for a translation. No headword definition is expected to
/// be larger than 1 MiB.
pub static MAX_BYTES_FOR_BUFFER: u64 = 1_048_576;

/// Bit mask to query the flags byte of a `.dz` file for the FEXTRA bit, i.e. whether an extra
/// field (which carries the dictzip chunk table) is present.
pub static GZ_FEXTRA: u8 = 0b0000_0100;
/// Bit mask to query the flags byte of a `.dz` file for the FNAME bit, i.e. whether a
/// zero-terminated original file name is stored in the header.
pub static GZ_FNAME: u8 = 0b0000_1000;
/// Bit mask to query the flags byte of a `.dz` file for the FCOMMENT bit, i.e. whether a
/// zero-terminated comment is stored in the header.
pub static GZ_COMMENT: u8 = 0b0001_0000;
/// Bit mask to query the flags byte of a `.dz` file for the FHCRC bit, i.e. whether a two-byte
/// CRC16 of the header follows the optional header fields.
pub static GZ_FHCRC: u8 = 0b0000_0010;
33
34/// A dictionary (content) reader.
35///
36/// This type abstracts from the underlying seek operations required for lookup
37/// of headwords and provides easy methods to search for a word given a certain
38/// offset and length. Users of a type which implements this trait don't need to care about compression
39/// of the dictionary.
40pub trait DictReader {
41    /// Fetch the definition from the dictionary at offset and length.
42    fn fetch_definition(&mut self, start_offset: u64, length: u64) -> Result<String, DictError>;
43}
44
45/// Raw Dict reader.
46///
47/// This reader can read uncompressed .dict files.
48pub struct DictReaderRaw<B: Read + Seek> {
49    dict_data: B,
50    total_length: u64,
51}
52
53impl<B: Read + Seek> DictReaderRaw<B> {
54    /// Get a new DictReader from a Reader.
55    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
56    pub fn new(mut dict_data: B) -> Result<DictReaderRaw<B>, DictError> {
57        let end = dict_data.seek(SeekFrom::End(0))?;
58        Ok(DictReaderRaw {
59            dict_data,
60            total_length: end,
61        })
62    }
63}
64
65impl<B: Read + Seek> DictReader for DictReaderRaw<B> {
66    /// Fetch definition from dictionary.
67    #[cfg_attr(
68        feature = "tracing",
69        tracing::instrument(skip(self), fields(start_offset, length))
70    )]
71    fn fetch_definition(&mut self, start_offset: u64, length: u64) -> Result<String, DictError> {
72        if length > MAX_BYTES_FOR_BUFFER {
73            return Err(DictError::MemoryError);
74        }
75
76        if (start_offset + length) > self.total_length {
77            return Err(DictError::IoError(io::Error::new(
78                io::ErrorKind::UnexpectedEof,
79                "a \
80                      seek beyond the end of uncompressed data was requested",
81            )));
82        }
83
84        self.dict_data.seek(SeekFrom::Start(start_offset))?;
85        let mut read_data = vec![0; length as usize];
86        let bytes_read = self.dict_data.read(read_data.as_mut_slice())? as u64;
87        if bytes_read != length {
88            // reading from end of file?
89            return Err(DictError::IoError(io::Error::new(
90                io::ErrorKind::UnexpectedEof,
91                "seek beyond end of file",
92            )));
93        }
94        Ok(String::from_utf8(read_data)?)
95    }
96}
97
98/// Load a `DictReader` from file.
99///
100/// This function loads a `Dictreader` from a file and transparently selects
101/// the correct reader using the file type extension, so the callee doesn't need to care about
102/// compression (`.dz`).
103///
104/// # Errors
105///
106/// The function can return a `DictError`, which can either occur if a I/O error occurs, or when
107/// the GZ compressed file is invalid.
108#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
109pub fn load_dict<P: AsRef<Path>>(path: P) -> Result<Box<dyn DictReader>, DictError> {
110    if path.as_ref().extension() == Some(OsStr::new("dz")) {
111        let reader = File::open(path)?;
112        Ok(Box::new(DictReaderDz::new(reader)?))
113    } else {
114        let reader = BufReader::new(File::open(path)?);
115        Ok(Box::new(DictReaderRaw::new(reader)?))
116    }
117}
118
/// Reader for dictzip-compressed dictionaries (`.dict.dz`).
///
/// The dictzip format is gzip (RFC 1952) with a chunk table stored in the header's extra field;
/// see `man dictzip`. An example implementation can be found in the dict daemon (dictd) in
/// `data.c`.
pub struct DictReaderDz<B: Read + Seek> {
    /// The compressed dictionary.
    dzdict: B,
    /// Size of one chunk before compression, in bytes.
    uchunk_length: usize,
    /// File position right behind the last compressed chunk.
    end_compressed_data: usize,
    /// File position at which each compressed chunk begins, in chunk order.
    chunk_offsets: Vec<usize>,
    /// Size of the whole dictionary after decompression; kept as `u64` so it can be compared
    /// with requested offsets without a cast.
    ufile_length: u64,
}
136
/// One compressed (GZ) chunk, described by where it starts within the compressed file and how
/// many bytes it occupies there.
#[derive(Debug)]
struct Chunk {
    /// Position of the chunk's first byte in the compressed file.
    offset: usize,
    /// Number of compressed bytes belonging to the chunk.
    length: usize,
}
143
144impl<B: Read + Seek> DictReaderDz<B> {
145    /// Get a new DictReader from a Reader.
146    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
147    pub fn new(dzdict: B) -> Result<DictReaderDz<B>, DictError> {
148        let mut buffered_dzdict = BufReader::new(dzdict);
149        let mut header = vec![0u8; 12];
150        buffered_dzdict.read_exact(&mut header)?;
151        if header[0..2] != [0x1F, 0x8B] {
152            return Err(DictError::InvalidFileFormat(
153                "Not in gzip format".into(),
154                None,
155            ));
156        }
157
158        let flags = &header[3]; // Bitmap of gzip attributes.
159        if (flags & GZ_FEXTRA) == 0 {
160            // Check whether FLG.FEXTRA is set.
161            return Err(DictError::InvalidFileFormat(
162                "Extra flag (FLG.FEXTRA) \
163                       not set, not in gzip + dzip format"
164                    .into(),
165                None,
166            ));
167        }
168
169        // Read XLEN, length of extra FEXTRA field.
170        let xlen = LittleEndian::read_u16(&header[10..12]);
171
172        // Read FEXTRA data.
173        let mut fextra = vec![0u8; xlen as usize];
174        buffered_dzdict.read_exact(&mut fextra)?;
175
176        if fextra[0..2] != [b'R', b'A'] {
177            return Err(DictError::InvalidFileFormat(
178                "No dictzip info found in FEXTRA \
179                    header (behind XLEN, in SI1SI2 fields)"
180                    .into(),
181                None,
182            ));
183        }
184
185        let length_subfield = LittleEndian::read_u16(&fextra[2..4]);
186        assert_eq!(
187            length_subfield,
188            xlen - 4,
189            "the length of the subfield \
190                   should be the same as the fextra field, ignoring the \
191                   additional length information and the file format identification"
192        );
193        let subf_version = LittleEndian::read_u16(&fextra[4..6]);
194        if subf_version != 1 {
195            return Err(DictError::InvalidFileFormat(
196                "Unimplemented dictzip \
197                     version, only ver 1 supported"
198                    .into(),
199                None,
200            ));
201        }
202
203        // Before compression, the file is split into evenly-sized chunks and the size information
204        // is put right after the version information:
205        let uchunk_length = LittleEndian::read_u16(&fextra[6..8]);
206        // Number of chunks in the file.
207        let chunk_count = LittleEndian::read_u16(&fextra[8..10]);
208        if chunk_count == 0 {
209            return Err(DictError::InvalidFileFormat(
210                "No compressed chunks in \
211                    file or broken header information"
212                    .into(),
213                None,
214            ));
215        }
216
217        // Compute number of possible chunks which would fit into the FEXTRA field; used for
218        // validity check. The first 10 bytes of FEXTRA are header information, the rest are 2-byte,
219        // little-endian numbers.
220        let numbers_chunks_which_would_fit = ((fextra.len() - 10) / 2) as u16; // each chunk represented by u16 == 2 bytes
221                                                                               // Check that number of claimed chunks fits within given size for subfield.
222        if numbers_chunks_which_would_fit != chunk_count {
223            return Err(DictError::InvalidFileFormat(
224                format!(
225                    "Expected {} chunks \
226                      according to dictzip header, but the FEXTRA field can \
227                      accomodate {}; possibly broken file",
228                    chunk_count, numbers_chunks_which_would_fit
229                ),
230                None,
231            ));
232        }
233
234        // If file name bit set, seek beyond the 0-terminated file name, we don't care.
235        if (flags & GZ_FNAME) != 0 {
236            let mut tmp = Vec::new();
237            buffered_dzdict.read_until(b'\0', &mut tmp)?;
238        }
239
240        // Seek past comment, if any.
241        if (flags & GZ_COMMENT) != 0 {
242            let mut tmp = Vec::new();
243            buffered_dzdict.read_until(b'\0', &mut tmp)?;
244        }
245
246        // Skip CRC stuff, 2 bytes.
247        if (flags & GZ_FHCRC) != 0 {
248            buffered_dzdict.seek(SeekFrom::Current(2))?;
249        }
250
251        // Save length of each compressed chunk.
252        let mut chunk_offsets = Vec::with_capacity(chunk_count as usize);
253        // Save position of last compressed byte (this is NOT EOF, could be followed by CRC checksum).
254        let mut end_compressed_data = buffered_dzdict.seek(SeekFrom::Current(0))? as usize;
255        // After the various header bytes parsed above, the list of chunk lengths can be found (slice for easier indexing).
256        let chunks_from_header = &fextra[10usize..(10 + chunk_count * 2) as usize];
257
258        // Iterate over each 2nd byte, parse u16.
259        for index in (0..chunks_from_header.len()).filter(|i| (i % 2) == 0) {
260            let index = index as usize;
261            let compressed_len =
262                LittleEndian::read_u16(&chunks_from_header[index..(index + 2)]) as usize;
263            chunk_offsets.push(end_compressed_data);
264            end_compressed_data += compressed_len;
265        }
266        assert_eq!(chunk_offsets.len() as u16, chunk_count, "The read number of compressed chunks in \
267                the .dz file must be equivalent to the number of chunks actually found in the file.\n");
268
269        // Read uncompressed file length.
270        buffered_dzdict.seek(SeekFrom::Start(end_compressed_data as u64))?;
271        let uncompressed = buffered_dzdict.read_i32::<LittleEndian>()?;
272
273        Ok(DictReaderDz {
274            dzdict: buffered_dzdict.into_inner(),
275            chunk_offsets,
276            end_compressed_data,
277            uchunk_length: uchunk_length as usize,
278            ufile_length: uncompressed as u64,
279        })
280    }
281
282    #[cfg_attr(
283        feature = "tracing",
284        tracing::instrument(skip(self), fields(start_offset, length))
285    )]
286    fn get_chunks_for(&self, start_offset: u64, length: u64) -> Vec<Chunk> {
287        let mut chunks = Vec::new();
288        let start_chunk = start_offset as usize / self.uchunk_length;
289        let end_chunk = (start_offset + length) as usize / self.uchunk_length;
290        for id in start_chunk..=end_chunk {
291            let chunk_length = match self.chunk_offsets.get(id + 1) {
292                Some(next) => next - self.chunk_offsets[id],
293                None => self.end_compressed_data - self.chunk_offsets[id],
294            };
295            chunks.push(Chunk {
296                offset: self.chunk_offsets[id],
297                length: chunk_length,
298            });
299        }
300
301        chunks
302    }
303
304    // Inflate a dictdz chunk.
305    #[cfg_attr(feature = "tracing", tracing::instrument(skip(self), fields(data_len = data.len())))]
306    fn inflate(&self, data: Vec<u8>) -> Result<Vec<u8>, DictError> {
307        let mut decoder = flate2::Decompress::new(false);
308        let mut decoded = vec![0u8; self.uchunk_length];
309        decoder.decompress(
310            data.as_slice(),
311            decoded.as_mut_slice(),
312            flate2::FlushDecompress::None,
313        )?;
314        Ok(decoded)
315    }
316}
317
318impl<B: Read + Seek> DictReader for DictReaderDz<B> {
319    // Fetch definition from the dictionary.
320    #[cfg_attr(
321        feature = "tracing",
322        tracing::instrument(skip(self), fields(start_offset, length))
323    )]
324    fn fetch_definition(&mut self, start_offset: u64, length: u64) -> Result<String, DictError> {
325        if length > MAX_BYTES_FOR_BUFFER {
326            return Err(DictError::MemoryError);
327        }
328        if (start_offset + length) > self.ufile_length {
329            return Err(DictError::IoError(io::Error::new(
330                io::ErrorKind::UnexpectedEof,
331                "a \
332                      seek beyond the end of uncompressed data was requested",
333            )));
334        }
335        let mut data = Vec::new();
336        for chunk in self.get_chunks_for(start_offset, length) {
337            let pos = self.dzdict.seek(SeekFrom::Start(chunk.offset as u64))?;
338            if pos != (chunk.offset as u64) {
339                return Err(DictError::IoError(io::Error::new(
340                    io::ErrorKind::Other,
341                    format!(
342                        "attempted to seek to {} but new position is {}",
343                        chunk.offset, pos
344                    ),
345                )));
346            }
347            let mut definition = vec![0u8; chunk.length];
348            self.dzdict.read_exact(&mut definition)?;
349            data.push(self.inflate(definition)?);
350        }
351
352        // Cut definition, convert to string.
353        let cut_front = start_offset as usize % self.uchunk_length;
354        // Join the chunks to one vector, only keeping the content of the definition.
355        let data = match data.len() {
356            0 => panic!(),
357            1 => data[0][cut_front..cut_front + length as usize].to_vec(),
358            n => {
359                let mut tmp = data[0][cut_front..].to_vec();
360                // First vec has been inserted into tmp, therefore skip first and last chunk, too.
361                for text in data.iter().skip(1).take(n - 2) {
362                    tmp.extend_from_slice(text);
363                }
364                // Add last chunk to tmp, omitting stuff after word definition end.
365                let remaining_bytes = (length as usize + cut_front) % self.uchunk_length;
366                tmp.extend_from_slice(&data[n - 1][..remaining_bytes]);
367                tmp
368            }
369        };
370        Ok(String::from_utf8(data)?)
371    }
372}